CoronaVirus Prediction

In [1]:
# Get data from Github
import numpy as np
from math import sqrt
from sklearn.metrics import mean_squared_error
import pandas as pd

#url_1 = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Confirmed.csv'
url_1 = 'https://github.com/CSSEGISandData/COVID-19/raw/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
confirmed = pd.read_csv(url_1, error_bad_lines=False)

#url_2 = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Deaths.csv'
url_2 = 'https://github.com/CSSEGISandData/COVID-19/raw/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv'
death = pd.read_csv(url_2, error_bad_lines=False)

#url_3 = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Recovered.csv'
url_3 = 'https://github.com/CSSEGISandData/COVID-19/raw/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv'
recover = pd.read_csv(url_3, error_bad_lines=False)

# fix region names
confirmed['Country/Region']= confirmed['Country/Region'].str.replace("Mainland China", "China")
confirmed['Country/Region']= confirmed['Country/Region'].str.replace("US", "United States")

death['Country/Region']= death['Country/Region'].str.replace("Mainland China", "China")
death['Country/Region']= death['Country/Region'].str.replace("US", "United States")

recover['Country/Region']= recover['Country/Region'].str.replace("Mainland China", "China")
recover['Country/Region']= recover['Country/Region'].str.replace("US", "United States")
In [2]:
confirmed.iloc[:,:]
Out[2]:
Province/State Country/Region Lat Long 1/22/20 1/23/20 1/24/20 1/25/20 1/26/20 1/27/20 ... 3/20/20 3/21/20 3/22/20 3/23/20 3/24/20 3/25/20 3/26/20 3/27/20 3/28/20 3/29/20
0 NaN Afghanistan 33.000000 65.000000 0 0 0 0 0 0 ... 24 24 40 40 74 84 94 110 110 120
1 NaN Albania 41.153300 20.168300 0 0 0 0 0 0 ... 70 76 89 104 123 146 174 186 197 212
2 NaN Algeria 28.033900 1.659600 0 0 0 0 0 0 ... 90 139 201 230 264 302 367 409 454 511
3 NaN Andorra 42.506300 1.521800 0 0 0 0 0 0 ... 75 88 113 133 164 188 224 267 308 334
4 NaN Angola -11.202700 17.873900 0 0 0 0 0 0 ... 1 2 2 3 3 3 4 4 5 7
5 NaN Antigua and Barbuda 17.060800 -61.796400 0 0 0 0 0 0 ... 1 1 1 3 3 3 7 7 7 7
6 NaN Argentina -38.416100 -63.616700 0 0 0 0 0 0 ... 128 158 266 301 387 387 502 589 690 745
7 NaN Armenia 40.069100 45.038200 0 0 0 0 0 0 ... 136 160 194 235 249 265 290 329 407 424
8 Australian Capital Territory Australia -35.473500 149.012400 0 0 0 0 0 0 ... 6 9 19 32 39 39 53 62 71 77
9 New South Wales Australia -33.868800 151.209300 0 0 0 0 3 4 ... 353 436 669 669 818 1029 1219 1405 1617 1791
10 Northern Territory Australia -12.463400 130.845600 0 0 0 0 0 0 ... 3 3 5 5 6 6 12 12 15 15
11 Queensland Australia -28.016700 153.400000 0 0 0 0 0 0 ... 184 221 259 319 397 443 493 555 625 656
12 South Australia Australia -34.928500 138.600700 0 0 0 0 0 0 ... 50 67 100 134 170 170 235 257 287 299
13 Tasmania Australia -41.454500 145.970700 0 0 0 0 0 0 ... 10 16 22 28 28 36 47 47 62 66
14 Victoria Australia -37.813600 144.963100 0 0 0 0 1 1 ... 121 229 355 355 411 466 520 574 685 769
15 Western Australia Australia -31.950500 115.860500 0 0 0 0 0 0 ... 64 90 120 140 175 175 231 231 278 311
16 NaN Austria 47.516200 14.550100 0 0 0 0 0 0 ... 2388 2814 3582 4474 5283 5588 6909 7657 8271 8788
17 NaN Azerbaijan 40.143100 47.576900 0 0 0 0 0 0 ... 44 53 65 72 87 93 122 165 182 209
18 NaN Bahamas 25.034300 -77.396300 0 0 0 0 0 0 ... 3 4 4 4 5 5 9 10 10 11
19 NaN Bahrain 26.027500 50.550000 0 0 0 0 0 0 ... 285 305 334 377 392 419 458 466 476 499
20 NaN Bangladesh 23.685000 90.356300 0 0 0 0 0 0 ... 20 25 27 33 39 39 44 48 48 48
21 NaN Barbados 13.193900 -59.543200 0 0 0 0 0 0 ... 5 6 14 17 18 18 18 24 26 33
22 NaN Belarus 53.709800 27.953400 0 0 0 0 0 0 ... 69 76 76 81 81 86 86 94 94 94
23 NaN Belgium 50.833300 4.000000 0 0 0 0 0 0 ... 2257 2815 3401 3743 4269 4937 6235 7284 9134 10836
24 NaN Benin 9.307700 2.315800 0 0 0 0 0 0 ... 2 2 2 5 6 6 6 6 6 6
25 NaN Bhutan 27.514200 90.433600 0 0 0 0 0 0 ... 2 2 2 2 2 2 2 3 3 4
26 NaN Bolivia -16.290200 -63.588700 0 0 0 0 0 0 ... 15 19 24 27 29 32 43 61 74 81
27 NaN Bosnia and Herzegovina 43.915900 17.679100 0 0 0 0 0 0 ... 89 93 126 136 166 176 191 237 258 323
28 NaN Brazil -14.235000 -51.925300 0 0 0 0 0 0 ... 793 1021 1546 1924 2247 2554 2985 3417 3904 4256
29 NaN Brunei 4.535300 114.727700 0 0 0 0 0 0 ... 78 83 88 91 104 109 114 115 120 126
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
223 NaN United Kingdom 55.378100 -3.436000 0 0 0 0 0 0 ... 3983 5018 5683 6650 8077 9529 11658 14543 17089 19522
224 NaN Uruguay -32.522800 -55.765800 0 0 0 0 0 0 ... 94 110 158 162 162 189 217 238 274 304
225 NaN United States 37.090200 -95.712900 1 1 2 2 5 5 ... 19100 25489 33276 43847 53740 65778 83836 101657 121478 140886
226 NaN Uzbekistan 41.377500 64.585300 0 0 0 0 0 0 ... 33 43 43 46 50 60 75 88 104 144
227 NaN Venezuela 6.423800 -66.589700 0 0 0 0 0 0 ... 42 70 70 77 84 91 107 107 119 119
228 NaN Vietnam 16.000000 108.000000 0 2 2 2 2 2 ... 91 94 113 123 134 141 153 163 174 188
229 NaN Zambia -15.416700 28.283300 0 0 0 0 0 0 ... 2 2 3 3 3 12 16 22 28 29
230 NaN Zimbabwe -20.000000 30.000000 0 0 0 0 0 0 ... 1 3 3 3 3 3 3 5 7 7
231 Diamond Princess Canada 0.000000 0.000000 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
232 NaN Dominica 15.415000 -61.371000 0 0 0 0 0 0 ... 0 0 1 2 2 7 11 11 11 11
233 NaN Grenada 12.116500 -61.679000 0 0 0 0 0 0 ... 0 0 1 1 1 1 7 7 7 9
234 NaN Mozambique -18.665695 35.529562 0 0 0 0 0 0 ... 0 0 1 1 3 5 7 7 8 8
235 NaN Syria 34.802075 38.996815 0 0 0 0 0 0 ... 0 0 1 1 1 5 5 5 5 9
236 NaN Timor-Leste -8.874217 125.727539 0 0 0 0 0 0 ... 0 0 1 1 1 1 1 1 1 1
237 NaN Belize 13.193900 -59.543200 0 0 0 0 0 0 ... 0 0 0 1 1 2 2 2 2 2
238 Recovered Canada 0.000000 0.000000 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
239 NaN Laos 19.856270 102.495496 0 0 0 0 0 0 ... 0 0 0 0 2 3 6 6 8 8
240 NaN Libya 26.335100 17.228331 0 0 0 0 0 0 ... 0 0 0 0 1 1 1 1 3 8
241 NaN West Bank and Gaza 31.952200 35.233200 0 0 0 0 0 0 ... 47 48 52 59 59 59 84 91 98 109
242 NaN Guinea-Bissau 11.803700 -15.180400 0 0 0 0 0 0 ... 0 0 0 0 0 2 2 2 2 2
243 NaN Mali 17.570692 -3.996166 0 0 0 0 0 0 ... 0 0 0 0 0 2 4 11 18 18
244 NaN Saint Kitts and Nevis 17.357822 -62.782998 0 0 0 0 0 0 ... 0 0 0 0 0 2 2 2 2 2
245 Northwest Territories Canada 64.825500 -124.845700 0 0 0 0 0 0 ... 0 0 0 0 0 0 1 1 1 1
246 Yukon Canada 64.282300 -135.000000 0 0 0 0 0 0 ... 0 0 0 0 0 0 3 3 4 4
247 NaN Kosovo 42.602636 20.902977 0 0 0 0 0 0 ... 0 0 0 0 0 0 71 86 91 94
248 NaN Burma 21.916200 95.956000 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 8 8 10
249 Anguilla United Kingdom 18.220600 -63.068600 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 2 2
250 British Virgin Islands United Kingdom 18.420700 -64.640000 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 2 2
251 Turks and Caicos Islands United Kingdom 21.694000 -71.797900 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 4 4
252 NaN MS Zaandam 0.000000 0.000000 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 2 2

253 rows × 72 columns

Get Population

In [3]:
population=pd.read_csv('/home/notebookuser/notebooks/covid19/population.csv', sep=',', encoding='latin1') 
confirmed=pd.merge(confirmed, population,how='left' ,on=['Province/State','Country/Region'])
death=pd.merge(death, population,how='left' ,on=['Province/State','Country/Region'])
recover=pd.merge(recover, population,how='left' ,on=['Province/State','Country/Region'])
In [4]:
# merge region confirmed + death + recover
confirmed['region']=confirmed['Country/Region'].map(str)+'_'+confirmed['Province/State'].map(str)
death['region']=death['Country/Region'].map(str)+'_'+death['Province/State'].map(str)
recover['region']=recover['Country/Region'].map(str)+'_'+recover['Province/State'].map(str)
confirmed.iloc[:,:]
Out[4]:
Province/State Country/Region Lat Long 1/22/20 1/23/20 1/24/20 1/25/20 1/26/20 1/27/20 ... 3/22/20 3/23/20 3/24/20 3/25/20 3/26/20 3/27/20 3/28/20 3/29/20 Population region
0 NaN Afghanistan 33.000000 65.000000 0 0 0 0 0 0 ... 40 40 74 84 94 110 110 120 35,530,000 Afghanistan_nan
1 NaN Albania 41.153300 20.168300 0 0 0 0 0 0 ... 89 104 123 146 174 186 197 212 NaN Albania_nan
2 NaN Algeria 28.033900 1.659600 0 0 0 0 0 0 ... 201 230 264 302 367 409 454 511 41,320,000 Algeria_nan
3 NaN Andorra 42.506300 1.521800 0 0 0 0 0 0 ... 113 133 164 188 224 267 308 334 NaN Andorra_nan
4 NaN Angola -11.202700 17.873900 0 0 0 0 0 0 ... 2 3 3 3 4 4 5 7 NaN Angola_nan
5 NaN Antigua and Barbuda 17.060800 -61.796400 0 0 0 0 0 0 ... 1 3 3 3 7 7 7 7 NaN Antigua and Barbuda_nan
6 NaN Argentina -38.416100 -63.616700 0 0 0 0 0 0 ... 266 301 387 387 502 589 690 745 NaN Argentina_nan
7 NaN Armenia 40.069100 45.038200 0 0 0 0 0 0 ... 194 235 249 265 290 329 407 424 NaN Armenia_nan
8 Australian Capital Territory Australia -35.473500 149.012400 0 0 0 0 0 0 ... 19 32 39 39 53 62 71 77 NaN Australia_Australian Capital Territory
9 New South Wales Australia -33.868800 151.209300 0 0 0 0 3 4 ... 669 669 818 1029 1219 1405 1617 1791 7,544,000 Australia_New South Wales
10 Northern Territory Australia -12.463400 130.845600 0 0 0 0 0 0 ... 5 5 6 6 12 12 15 15 NaN Australia_Northern Territory
11 Queensland Australia -28.016700 153.400000 0 0 0 0 0 0 ... 259 319 397 443 493 555 625 656 5,071,000 Australia_Queensland
12 South Australia Australia -34.928500 138.600700 0 0 0 0 0 0 ... 100 134 170 170 235 257 287 299 1,677,000 Australia_South Australia
13 Tasmania Australia -41.454500 145.970700 0 0 0 0 0 0 ... 22 28 28 36 47 47 62 66 NaN Australia_Tasmania
14 Victoria Australia -37.813600 144.963100 0 0 0 0 1 1 ... 355 355 411 466 520 574 685 769 92,141 Australia_Victoria
15 Western Australia Australia -31.950500 115.860500 0 0 0 0 0 0 ... 120 140 175 175 231 231 278 311 NaN Australia_Western Australia
16 NaN Austria 47.516200 14.550100 0 0 0 0 0 0 ... 3582 4474 5283 5588 6909 7657 8271 8788 8,822,000 Austria_nan
17 NaN Azerbaijan 40.143100 47.576900 0 0 0 0 0 0 ... 65 72 87 93 122 165 182 209 NaN Azerbaijan_nan
18 NaN Bahamas 25.034300 -77.396300 0 0 0 0 0 0 ... 4 4 5 5 9 10 10 11 NaN Bahamas_nan
19 NaN Bahrain 26.027500 50.550000 0 0 0 0 0 0 ... 334 377 392 419 458 466 476 499 1,493,000 Bahrain_nan
20 NaN Bangladesh 23.685000 90.356300 0 0 0 0 0 0 ... 27 33 39 39 44 48 48 48 NaN Bangladesh_nan
21 NaN Barbados 13.193900 -59.543200 0 0 0 0 0 0 ... 14 17 18 18 18 24 26 33 NaN Barbados_nan
22 NaN Belarus 53.709800 27.953400 0 0 0 0 0 0 ... 76 81 81 86 86 94 94 94 NaN Belarus_nan
23 NaN Belgium 50.833300 4.000000 0 0 0 0 0 0 ... 3401 3743 4269 4937 6235 7284 9134 10836 11,400,000 Belgium_nan
24 NaN Benin 9.307700 2.315800 0 0 0 0 0 0 ... 2 5 6 6 6 6 6 6 NaN Benin_nan
25 NaN Bhutan 27.514200 90.433600 0 0 0 0 0 0 ... 2 2 2 2 2 3 3 4 NaN Bhutan_nan
26 NaN Bolivia -16.290200 -63.588700 0 0 0 0 0 0 ... 24 27 29 32 43 61 74 81 NaN Bolivia_nan
27 NaN Bosnia and Herzegovina 43.915900 17.679100 0 0 0 0 0 0 ... 126 136 166 176 191 237 258 323 NaN Bosnia and Herzegovina_nan
28 NaN Brazil -14.235000 -51.925300 0 0 0 0 0 0 ... 1546 1924 2247 2554 2985 3417 3904 4256 209,300,000 Brazil_nan
29 NaN Brunei 4.535300 114.727700 0 0 0 0 0 0 ... 88 91 104 109 114 115 120 126 NaN Brunei_nan
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
223 NaN United Kingdom 55.378100 -3.436000 0 0 0 0 0 0 ... 5683 6650 8077 9529 11658 14543 17089 19522 66,440,000 United Kingdom_nan
224 NaN Uruguay -32.522800 -55.765800 0 0 0 0 0 0 ... 158 162 162 189 217 238 274 304 NaN Uruguay_nan
225 NaN United States 37.090200 -95.712900 1 1 2 2 5 5 ... 33276 43847 53740 65778 83836 101657 121478 140886 NaN United States_nan
226 NaN Uzbekistan 41.377500 64.585300 0 0 0 0 0 0 ... 43 46 50 60 75 88 104 144 NaN Uzbekistan_nan
227 NaN Venezuela 6.423800 -66.589700 0 0 0 0 0 0 ... 70 77 84 91 107 107 119 119 NaN Venezuela_nan
228 NaN Vietnam 16.000000 108.000000 0 2 2 2 2 2 ... 113 123 134 141 153 163 174 188 95,540,000 Vietnam_nan
229 NaN Zambia -15.416700 28.283300 0 0 0 0 0 0 ... 3 3 3 12 16 22 28 29 NaN Zambia_nan
230 NaN Zimbabwe -20.000000 30.000000 0 0 0 0 0 0 ... 3 3 3 3 3 5 7 7 NaN Zimbabwe_nan
231 Diamond Princess Canada 0.000000 0.000000 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 NaN Canada_Diamond Princess
232 NaN Dominica 15.415000 -61.371000 0 0 0 0 0 0 ... 1 2 2 7 11 11 11 11 NaN Dominica_nan
233 NaN Grenada 12.116500 -61.679000 0 0 0 0 0 0 ... 1 1 1 1 7 7 7 9 NaN Grenada_nan
234 NaN Mozambique -18.665695 35.529562 0 0 0 0 0 0 ... 1 1 3 5 7 7 8 8 NaN Mozambique_nan
235 NaN Syria 34.802075 38.996815 0 0 0 0 0 0 ... 1 1 1 5 5 5 5 9 NaN Syria_nan
236 NaN Timor-Leste -8.874217 125.727539 0 0 0 0 0 0 ... 1 1 1 1 1 1 1 1 NaN Timor-Leste_nan
237 NaN Belize 13.193900 -59.543200 0 0 0 0 0 0 ... 0 1 1 2 2 2 2 2 NaN Belize_nan
238 Recovered Canada 0.000000 0.000000 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 NaN Canada_Recovered
239 NaN Laos 19.856270 102.495496 0 0 0 0 0 0 ... 0 0 2 3 6 6 8 8 NaN Laos_nan
240 NaN Libya 26.335100 17.228331 0 0 0 0 0 0 ... 0 0 1 1 1 1 3 8 NaN Libya_nan
241 NaN West Bank and Gaza 31.952200 35.233200 0 0 0 0 0 0 ... 52 59 59 59 84 91 98 109 NaN West Bank and Gaza_nan
242 NaN Guinea-Bissau 11.803700 -15.180400 0 0 0 0 0 0 ... 0 0 0 2 2 2 2 2 NaN Guinea-Bissau_nan
243 NaN Mali 17.570692 -3.996166 0 0 0 0 0 0 ... 0 0 0 2 4 11 18 18 NaN Mali_nan
244 NaN Saint Kitts and Nevis 17.357822 -62.782998 0 0 0 0 0 0 ... 0 0 0 2 2 2 2 2 NaN Saint Kitts and Nevis_nan
245 Northwest Territories Canada 64.825500 -124.845700 0 0 0 0 0 0 ... 0 0 0 0 1 1 1 1 NaN Canada_Northwest Territories
246 Yukon Canada 64.282300 -135.000000 0 0 0 0 0 0 ... 0 0 0 0 3 3 4 4 NaN Canada_Yukon
247 NaN Kosovo 42.602636 20.902977 0 0 0 0 0 0 ... 0 0 0 0 71 86 91 94 NaN Kosovo_nan
248 NaN Burma 21.916200 95.956000 0 0 0 0 0 0 ... 0 0 0 0 0 8 8 10 NaN Burma_nan
249 Anguilla United Kingdom 18.220600 -63.068600 0 0 0 0 0 0 ... 0 0 0 0 0 0 2 2 NaN United Kingdom_Anguilla
250 British Virgin Islands United Kingdom 18.420700 -64.640000 0 0 0 0 0 0 ... 0 0 0 0 0 0 2 2 NaN United Kingdom_British Virgin Islands
251 Turks and Caicos Islands United Kingdom 21.694000 -71.797900 0 0 0 0 0 0 ... 0 0 0 0 0 0 4 4 NaN United Kingdom_Turks and Caicos Islands
252 NaN MS Zaandam 0.000000 0.000000 0 0 0 0 0 0 ... 0 0 0 0 0 0 2 2 NaN MS Zaandam_nan

253 rows × 74 columns

In [5]:
# merge region death
death.iloc[175:185,:]
Out[5]:
Province/State Country/Region Lat Long 1/22/20 1/23/20 1/24/20 1/25/20 1/26/20 1/27/20 ... 3/22/20 3/23/20 3/24/20 3/25/20 3/26/20 3/27/20 3/28/20 3/29/20 Population region
175 NaN Norway 60.4720 8.4689 0 0 0 0 0 0 ... 7 10 12 14 14 19 23 25 5,357,000 Norway_nan
176 NaN Oman 21.0000 57.0000 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 4,636,000 Oman_nan
177 NaN Pakistan 30.3753 69.3451 0 0 0 0 0 0 ... 5 6 7 8 9 11 12 14 197,000,000 Pakistan_nan
178 NaN Panama 8.5380 -80.7821 0 0 0 0 0 0 ... 3 6 6 8 8 9 14 17 NaN Panama_nan
179 NaN Papua New Guinea -6.3150 143.9555 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 NaN Papua New Guinea_nan
180 NaN Paraguay -23.4425 -58.4438 0 0 0 0 0 0 ... 1 1 2 3 3 3 3 3 NaN Paraguay_nan
181 NaN Peru -9.1900 -75.0152 0 0 0 0 0 0 ... 5 5 7 9 9 11 16 18 NaN Peru_nan
182 NaN Philippines 13.0000 122.0000 0 0 0 0 0 0 ... 25 33 35 38 45 54 68 71 104,900,000 Philippines_nan
183 NaN Poland 51.9194 19.1451 0 0 0 0 0 0 ... 7 8 10 14 16 16 18 22 37,980,000 Poland_nan
184 NaN Portugal 39.3999 -8.2245 0 0 0 0 0 0 ... 14 23 33 43 60 76 100 119 10,290,000 Portugal_nan

10 rows × 74 columns

In [6]:
# merge region recover
recover.iloc[175:185,:]
Out[6]:
Province/State Country/Region Lat Long 1/22/20 1/23/20 1/24/20 1/25/20 1/26/20 1/27/20 ... 3/22/20 3/23/20 3/24/20 3/25/20 3/26/20 3/27/20 3/28/20 3/29/20 Population region
175 NaN Panama 8.5380 -80.7821 0 0 0 0 0 0 ... 0 0 1 1 2 2 2 4 NaN Panama_nan
176 NaN Papua New Guinea -6.3150 143.9555 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 NaN Papua New Guinea_nan
177 NaN Paraguay -23.4425 -58.4438 0 0 0 0 0 0 ... 0 0 0 0 0 1 1 1 NaN Paraguay_nan
178 NaN Peru -9.1900 -75.0152 0 0 0 0 0 0 ... 1 1 1 1 14 16 16 16 NaN Peru_nan
179 NaN Philippines 13.0000 122.0000 0 0 0 0 0 0 ... 17 17 20 26 28 31 35 42 104,900,000 Philippines_nan
180 NaN Poland 51.9194 19.1451 0 0 0 0 0 0 ... 1 1 1 7 7 7 7 7 37,980,000 Poland_nan
181 NaN Portugal 39.3999 -8.2245 0 0 0 0 0 0 ... 5 5 22 22 43 43 43 43 10,290,000 Portugal_nan
182 NaN Qatar 25.3548 51.1839 0 0 0 0 0 0 ... 33 33 41 41 43 43 45 48 NaN Qatar_nan
183 NaN Romania 45.9432 24.9668 0 0 0 0 0 0 ... 64 64 79 86 94 115 139 206 19,530,000 Romania_nan
184 NaN Russia 60.0000 90.0000 0 0 0 0 0 0 ... 16 16 22 29 38 45 49 64 144,500,000 Russia_nan

10 rows × 74 columns

In [7]:
confirmed.iloc[175:185,:]
Out[7]:
Province/State Country/Region Lat Long 1/22/20 1/23/20 1/24/20 1/25/20 1/26/20 1/27/20 ... 3/22/20 3/23/20 3/24/20 3/25/20 3/26/20 3/27/20 3/28/20 3/29/20 Population region
175 NaN Norway 60.4720 8.4689 0 0 0 0 0 0 ... 2385 2621 2863 3084 3369 3755 4015 4284 5,357,000 Norway_nan
176 NaN Oman 21.0000 57.0000 0 0 0 0 0 0 ... 55 66 84 99 109 131 152 167 4,636,000 Oman_nan
177 NaN Pakistan 30.3753 69.3451 0 0 0 0 0 0 ... 776 875 972 1063 1201 1373 1495 1597 197,000,000 Pakistan_nan
178 NaN Panama 8.5380 -80.7821 0 0 0 0 0 0 ... 313 345 345 443 558 674 786 901 NaN Panama_nan
179 NaN Papua New Guinea -6.3150 143.9555 0 0 0 0 0 0 ... 1 1 1 1 1 1 1 1 NaN Papua New Guinea_nan
180 NaN Paraguay -23.4425 -58.4438 0 0 0 0 0 0 ... 22 22 27 37 41 52 56 59 NaN Paraguay_nan
181 NaN Peru -9.1900 -75.0152 0 0 0 0 0 0 ... 363 395 416 480 580 635 671 852 NaN Peru_nan
182 NaN Philippines 13.0000 122.0000 0 0 0 0 0 0 ... 380 462 552 636 707 803 1075 1418 104,900,000 Philippines_nan
183 NaN Poland 51.9194 19.1451 0 0 0 0 0 0 ... 634 749 901 1051 1221 1389 1638 1862 37,980,000 Poland_nan
184 NaN Portugal 39.3999 -8.2245 0 0 0 0 0 0 ... 1600 2060 2362 2995 3544 4268 5170 5962 10,290,000 Portugal_nan

10 rows × 74 columns

Create Time Series + Plots

In [8]:
def create_ts(df):
  ts=df
  ts=ts.drop(['Province/State', 'Country/Region','Lat', 'Long',' Population '], axis=1)
  ts.set_index('region')
  ts=ts.T
  ts.columns=ts.loc['region']
  ts=ts.drop('region')
  ts=ts.fillna(0)
  ts=ts.reindex(sorted(ts.columns), axis=1)
  return (ts)
In [9]:
## JOAO - Fix - Drop Duplicates # Keep Last # Issue With Data source Change from John Hopkins institute

ts=create_ts(confirmed.drop_duplicates(subset=['region'], keep='last', inplace=False) )
ts_d=create_ts(death.drop_duplicates(subset=['region'], keep='last', inplace=False) )
ts_rec=create_ts(recover.drop_duplicates(subset=['region'], keep='last', inplace=False) )
In [13]:
import matplotlib.pyplot as plt
p=ts.reindex(ts.max().sort_values(ascending=False).index, axis=1)
p.iloc[:,:1].plot(marker='*',figsize=(20,8)).set_title('Daily Total Confirmed - Top World Region ',fontdict={'fontsize': 22})
p.iloc[:,2:25].plot(marker='*',figsize=(20,8)).set_title('Daily Total Confirmed - Major 2nd Areas',fontdict={'fontsize': 22})

p_d=ts_d.reindex(ts.max().sort_values(ascending=False).index, axis=1)
p_d.iloc[:,:1].plot(marker='*',figsize=(20,8)).set_title('Daily Total Death - Top World Region',fontdict={'fontsize': 22})
p_d.iloc[:,2:25].plot(marker='*',figsize=(20,8)).set_title('Daily Total Death - Major 2nd Areas',fontdict={'fontsize': 22})

p_r=ts_rec.reindex(ts.max().sort_values(ascending=False).index, axis=1)
p_r.iloc[:,:1].plot(marker='*',figsize=(20,8)).set_title('Daily Total Recoverd - Top World Region',fontdict={'fontsize': 22})
p_r.iloc[:,2:25].plot(marker='*',figsize=(20,8)).set_title('Daily Total Recoverd - Major 2nd Areas',fontdict={'fontsize': 22})
Out[13]:
Text(0.5, 1.0, 'Daily Total Recoverd - Major 2nd Areas')

Kalman Filter With R

In [14]:
# Create data for R script
ts_r=ts.reset_index()
ts_r=ts_r.rename(columns = {'index':'date'})
ts_r['date']=pd.to_datetime(ts_r['date'] ,errors ='coerce')
ts_r.to_csv(r'/home/notebookuser/notebooks/covid19/korean/ts_r.csv')
In [15]:
#!pip install rpy2
In [16]:
import rpy2
%load_ext rpy2.ipython
In [17]:
%%R

#install.packages('pracma')
#install.packages('Metrics')
#install.packages('readr')
#install.packages('reshape')
NULL
In [18]:
%%R
Sys.setenv(TZ='GMT')
Sys.timezone()
[1] "GMT"
In [19]:
%%R
require(pracma)
require(Metrics)
require(readr)
Sys.setenv(TZ='GMT')
all<- read_csv("/home/notebookuser/notebooks/covid19/ts_2_r.csv")
all$X1<-NULL
date<-all[,1]
date[nrow(date) + 1,1] <-all[nrow(all),1]+1
pred_all<-NULL
for (n in 2:ncol(all)-1) {
  Y<-ts(data = all[n+1], start = 1, end =nrow(all)+1)  
  sig_w<-0.01
  w<-sig_w*randn(1,100) # acceleration which denotes the fluctuation (Q/R) rnorm(100, mean = 0, sd = 1)
  sig_v<-0.01
  v<-sig_v*randn(1,100)   
  t<-0.45
  phi<-matrix(c(1,0,t,1),2,2)
  gama<-matrix(c(0.5*t^2,t),2,1)
  H<-matrix(c(1,0),1,2)
  #Kalman
  x0_0<-p0_0<-matrix(c(0,0),2,1)
  p0_0<-matrix(c(1,0,0,1),2,2)
  Q<-0.01
  R<-0.01
  X<-NULL
  X2<-NULL
  pred<-NULL
  for (i in 0:nrow(all)) {
    namp <-paste("p", i+1,"_",i, sep = "")
    assign(namp, phi%*%(get(paste("p", i,"_",i, sep = "")))%*%t(phi)+gama%*%Q%*%t(gama))
    namk <- paste("k", i+1, sep = "")
    assign(namk,get(paste("p", i+1,"_",i, sep = ""))%*%t(H)%*%(1/(H%*%get(paste("p", i+1,"_",i, sep = ""))%*%t(H)+R)))
    namx <- paste("x", i+1,"_",i, sep = "")
    assign(namx,phi%*%get(paste("x", i,"_",i, sep = "")))
    namE <- paste("E", i+1, sep = "")
    assign(namE,Y[i+1]-H%*%get(paste("x", i+1,"_",i, sep = "")))
    namx2 <- paste("x", i+1,"_",i+1, sep = "")
    assign(namx2,get(paste("x", i+1,"_",i, sep = ""))+get(paste("k", i+1, sep = ""))%*%get(paste("E", i+1, sep = "")))
    namp2 <- paste("p", i+1,"_",i+1, sep = "")
    assign(namp2,(p0_0-get(paste("k", i+1, sep = ""))%*%H)%*%get(paste("p", i+1,"_",i, sep = "")))
    X<-rbind(X,get(paste("x", i+1,"_",i,sep = ""))[1])
    X2<-rbind(X2,get(paste("x", i+1,"_",i,sep = ""))[2])
    if(i>2){
      remove(list=(paste("p", i-1,"_",i-2, sep = "")))
      remove(list=(paste("k", i-1, sep = "")))
      remove(list=(paste("E", i-1, sep = "")))
      remove(list=(paste("p", i-2,"_",i-2, sep = "")))
      remove(list=(paste("x", i-1,"_",i-2, sep = "")))
      remove(list=(paste("x", i-2,"_",i-2, sep = "")))}
  }
  pred<-NULL
  pred<-cbind(Y,X,round(X2,4))
  pred<-as.data.frame(pred)
  pred$region<-colnames(all[,n+1])
  pred$date<-date$date
  pred$actual<-rbind(0,(cbind(pred[2:nrow(pred),1])/pred[1:nrow(pred)-1,1]-1)*100)
  pred$predict<-rbind(0,(cbind(pred[2:nrow(pred),2])/pred[1:nrow(pred)-1,2]-1)*100)
  pred$pred_rate<-(pred$X/pred$Y-1)*100
  pred$X2_change<-rbind(0,(cbind(pred[2:nrow(pred),3]-pred[1:nrow(pred)-1,3])))
  pred_all<-rbind(pred_all,pred)
}
pred_all<-cbind(pred_all[,4:5],pred_all[,1:3])
names(pred_all)[5]<-"X2"
pred_all=pred_all[with( pred_all, order(region, date)), ]
pred_all<-pred_all[,3:5]
R[write to console]: Loading required package: pracma

R[write to console]: Loading required package: Metrics

R[write to console]: Loading required package: readr

R[write to console]: Parsed with column specification:
cols(
  .default = col_double(),
  date = col_date(format = "")
)

R[write to console]: See spec(...) for full column specifications.

In [20]:
p=%R pred_all
In [21]:
############ Merge R output due to package problem
t=ts_d
t=t.stack().reset_index(name='confirmed')
t.columns=['date', 'region','confirmed']
t['date']=pd.to_datetime(t['date'] ,errors ='coerce')
t=t.sort_values(['region', 'date'])

temp=t.iloc[:,:3]
temp=temp.reset_index(drop=True)
for i in range(1,len(t)+1):
  if(temp.iloc[i,1] is not temp.iloc[i-1,1]):
    temp.loc[len(temp)+1] = [temp.iloc[i-1,0]+ pd.DateOffset(1),temp.iloc[i-1,1], 0] 
temp=temp.sort_values(['region', 'date'])
temp=temp.reset_index(drop=True)
temp['Y']=p['Y']
temp['X']=p['X']
temp['X2']=p['X2']

Pre Proccessing Data for ML Model

Extract Weather Data

In [22]:
#!pip install pyweatherbit
In [23]:
#from weatherbit.api import Api
#import json  
#import pandas as pd  
#from pandas.io.json import json_normalize 

#api_key = "26141d374d8e49d0a2e1f1254428ce8e"
#api_key ="f206579c74644c4b96a2423cb56a1687"
# #api_key ="81a581ac823849f38427fb5081cb8df8"
# #api_key ="be7aaf4ff7184347aa1ebc2a2db514fe"

#api = Api(api_key)
#api.set_granularity('daily')

# # Set the granularity of the API - Options: ['daily','hourly','3hourly']
# # Will only affect forecast requests.
#api.get_forecast(lat='Lat', lon='Lon')

Weather History

In [24]:
# ################## already done since API is limited to 500 call per day

# w=pd.DataFrame(columns=['date','region','min','max'])
# for i in range (61,len(confirmed)):
#   start_date=pd.to_datetime('2020-01-22')
#   for j in range (4,confirmed.shape[1]-2):
#     jas=api.get_history(lat=confirmed.iloc[i,2], lon=confirmed.iloc[i,3], start_date=start_date.strftime('%Y-%m-%d'),end_date=(start_date+ pd.DateOffset(days=1)).strftime('%Y-%m-%d')).json
#     # j=json_normalize(j)
#     # j=j['data']
#     # max_temp=json_normalize(j['data'])['max_temp'].values[0] # max
#     # min_temp=json_normalize(j['data'])['min_temp'].values[0]
#     try:
#       w=w.append({'date':confirmed.columns[j],'region':confirmed.iloc[i,confirmed.shape[1]-1] ,'min':json_normalize(jas['data'])['min_temp'].values[0],'max':json_normalize(jas['data'])['max_temp'].values[0]}, ignore_index=True)
#     except Exception:
#       w=w.append({'date':confirmed.columns[j],'region':confirmed.iloc[i,confirmed.shape[1]-1] ,'min':None,'max':None}, ignore_index=True)
#     start_date=start_date+ pd.DateOffset(days=1)
In [25]:
# ################## Update Recent Day Weather

# w_update=pd.DataFrame(columns=['date','region','min','max'])
# for i in range (28,len(confirmed)):
#   start_date=pd.to_datetime('2020-02-17')
#   for j in range (confirmed.shape[1]-4,confirmed.shape[1]-2):
#     jas=api.get_history(lat=confirmed.iloc[i,2], lon=confirmed.iloc[i,3], start_date=start_date.strftime('%Y-%m-%d'),end_date=(start_date+ pd.DateOffset(days=1)).strftime('%Y-%m-%d')).json
#     try:
#       w_update=w_update.append({'date':confirmed.columns[j],'region':confirmed.iloc[i,confirmed.shape[1]-1] ,'min':json_normalize(jas['data'])['min_temp'].values[0],'max':json_normalize(jas['data'])['max_temp'].values[0]}, ignore_index=True)
#     except Exception:
#       w_update=w_update.append({'date':confirmed.columns[j],'region':confirmed.iloc[i,confirmed.shape[1]-1] ,'min':None,'max':None}, ignore_index=True)
#     start_date=start_date+ pd.DateOffset(days=1)
In [26]:
# w_update.to_csv(r'w_update.csv')

Extract Weather Forecast Data

In [27]:
# ################## Forecast Weather With API - Already Done
# #forecast = api.get_forecast(lat=lat, lon=lon)

# w_forecast=pd.DataFrame(columns=['datetime','min_temp','max_temp','region'])
# for i in range (0,len(confirmed)):
#     jas=api.get_forecast(lat=confirmed.iloc[i,2], lon=confirmed.iloc[i,3]).json
#     jas=json_normalize(jas['data'])[['datetime','min_temp','max_temp']]
#     try:
#       w_forecast_temp=jas
#       w_forecast_temp['region']=confirmed.iloc[i,confirmed.shape[1]-1]
#     except Exception:
#       w_forecast_temp=pd.DataFrame(columns=['datetime','min_temp','max_temp','region'])
#     w_forecast=w_forecast.append(w_forecast_temp)
# w_forecast=w_forecast[['datetime','region','min_temp','max_temp']]
# w_forecast.columns = ['date', 'region', 'min', 'max']
# w_forecast['date']=pd.to_datetime(w_forecast['date'],format='%Y-%m-%d')
In [28]:
w=pd.read_csv('/home/notebookuser/notebooks/covid19/w.csv', sep=',', encoding='latin1')
w['date']=pd.to_datetime(w['date'],format='%d/%m/%Y')
#w['date']=pd.to_datetime(w['date'],errors ='coerce')

w_forecast=pd.read_csv('/home/notebookuser/notebooks/covid19/w_forecast.csv', sep=',', encoding='latin1')
w_forecast['date']=pd.to_datetime(w_forecast['date'],format='%d/%m/%Y')

Build Train Set Data Structure

In [29]:
t=ts
t=t.stack().reset_index(name='confirmed')
t.columns=['date', 'region','confirmed']
t['date']=pd.to_datetime(t['date'] ,errors ='coerce')
t=t.sort_values(['region', 'date'])

# Add 1 Future day for prediction
t=t.reset_index(drop=True)
for i in range(1,len(t)+1):
  if(t.iloc[i,1] is not t.iloc[i-1,1]):
    t.loc[len(t)+1] = [t.iloc[i-1,0]+ pd.DateOffset(1),t.iloc[i-1,1], 0] 
t=t.sort_values(['region', 'date'])
t=t.reset_index(drop=True)
In [30]:
t['1_day_change']=t['3_day_change']=t['7_day_change']=t['1_day_change_rate']=t['3_day_change_rate']=t['7_day_change_rate']=t['last_day']=0
#
### JOAO - Fix - ipykernel_launcher.py:5: RuntimeWarning: divide by zero encountered in double_scalars
for i in range(1,len(t)):
  if(t.iloc[i,1] is t.iloc[i-2,1]):
    t.iloc[i,3]=t.iloc[i-1,2]-t.iloc[i-2,2]
    t.iloc[i,6]=((t.iloc[i-1,2] +1)/(t.iloc[i-2,2]-1 +1))*100
    t.iloc[i,9]=t.iloc[i-1,2]
  if(t.iloc[i,1] is t.iloc[i-4,1]):
    t.iloc[i,4]=t.iloc[i-1,2]-t.iloc[i-4,2]
    t.iloc[i,7]=((t.iloc[i-1,2] +1)/(t.iloc[i-4,2]-1 +1))*100
  if(t.iloc[i,1] is t.iloc[i-8,1]):
    t.iloc[i,5]=t.iloc[i-1,2]-t.iloc[i-8,2]
    t.iloc[i,8]=((t.iloc[i-1,2] +1)/(t.iloc[i-8,2]-1 +1))*100
t=t.fillna(0)  
t=t.merge(temp[['date','region', 'X']],how='left',on=['date','region'])
t=t.rename(columns = {'X':'kalman_prediction'}) 
t=t.replace([np.inf, -np.inf], 0)
### Joao - Fix NaN Kalman_Filter
#t['kalman_prediction']=round(t['kalman_prediction'])
t['kalman_prediction']=np.nan_to_num(t['kalman_prediction'])
t['kalman_prediction']=round(t['kalman_prediction'],2)
#
train=t.merge(confirmed[['region',' Population ']],how='left',on='region')
train=train.rename(columns = {' Population ':'population'})
train['population']=train['population'].str.replace(r" ", '')
train['population']=train['population'].str.replace(r",", '')
train['population']=train['population'].fillna(1)
train['population']=train['population'].astype('int32')
### JOAO - Fix - ipykernel_launcher.py:5: RuntimeWarning: divide by zero encountered in double_scalars
# train['infected_rate']=train['last_day']/train['population']*10000
train['infected_rate']=((train['last_day'] +1)/((train['population'] +1) *10000))
#
train=train.merge(w,how='left',on=['date','region'])
train=train.sort_values(['region', 'date'])
### fill missing weather 
for i in range(0,len(train)):
  if(np.isnan(train.iloc[i,13])):
    if(train.iloc[i,1] is train.iloc[i-1,1]):
      train.iloc[i,13]=train.iloc[i-1,13]
      train.iloc[i,14]=train.iloc[i-1,14]
/home/notebookuser/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:7: RuntimeWarning: divide by zero encountered in long_scalars
  import sys
/home/notebookuser/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:11: RuntimeWarning: divide by zero encountered in long_scalars
  # This is added back by InteractiveShellApp.init_path()
/home/notebookuser/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:14: RuntimeWarning: divide by zero encountered in long_scalars
  

Kalman 1 day Prediction with Evaluation

In [31]:
### JOAO - ERROR - ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
# Select region
region='China_Hubei'

evaluation=pd.DataFrame(columns=['region','mse','rmse','mae'])
place=0
for i in range(1,len(t)):
  if(t.iloc[i,1] is not t.iloc[i-1,1]):
    ex=np.array(t.iloc[i-len(ts):i,10])
    pred=np.array(t.iloc[i-len(ts):i,2])
    evaluation=evaluation.append({'region': t.iloc[i-1,1], 'mse': np.power((ex - pred),2).mean(),'rmse':sqrt(mean_squared_error(ex,pred)),'mae': (abs(ex - pred)).mean()}, ignore_index=True)
p=t[t['region']==region][['date','region','confirmed','kalman_prediction']]
p=p.rename(columns = {'confirmed':'recoverd'})
p.iloc[len(p)-1,2]=None
p=p.set_index(['date'])
p.iloc[:,1:].plot(marker='o',figsize=(16,8)).set_title('Kalman Prediction - Select Region to Change - {}'.format(p.iloc[0,0]))
print(evaluation[evaluation['region']==p.iloc[0,0]])
         region           mse          rmse       mae
68  China_Hubei  3.009469e+09  54858.624219  48547.75
In [32]:
### JOAO - ERROR - ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
# Select region
region='China_Shanghai'

evaluation=pd.DataFrame(columns=['region','mse','rmse','mae'])
place=0
for i in range(1,len(t)):
  if(t.iloc[i,1] is not t.iloc[i-1,1]):
    ex=np.array(t.iloc[i-len(ts):i,10])
    pred=np.array(t.iloc[i-len(ts):i,2])
    evaluation=evaluation.append({'region': t.iloc[i-1,1], 'mse': np.power((ex - pred),2).mean(),'rmse':sqrt(mean_squared_error(ex,pred)),'mae': (abs(ex - pred)).mean()}, ignore_index=True)
p=t[t['region']==region][['date','region','confirmed','kalman_prediction']]
p=p.rename(columns = {'confirmed':'recoverd'})
p.iloc[len(p)-1,2]=None
p=p.set_index(['date'])
p.iloc[:,1:].plot(marker='o',figsize=(16,8)).set_title('Kalman Prediction - Select Region to Change - {}'.format(p.iloc[0,0]))
print(evaluation[evaluation['region']==p.iloc[0,0]])
            region            mse        rmse         mae
80  China_Shanghai  101004.176471  317.811542  296.117647
In [33]:
region='Italy_nan'

evaluation=pd.DataFrame(columns=['region','mse','rmse','mae'])
place=0
for i in range(1,len(t)):
  if(t.iloc[i,1] is not t.iloc[i-1,1]):
    ex=np.array(t.iloc[i-len(ts):i,10])
    pred=np.array(t.iloc[i-len(ts):i,2])
    evaluation=evaluation.append({'region': t.iloc[i-1,1], 'mse': np.power((ex - pred),2).mean(),'rmse':sqrt(mean_squared_error(ex,pred)),'mae': (abs(ex - pred)).mean()}, ignore_index=True)
p=t[t['region']==region][['date','region','confirmed','kalman_prediction']]
p=p.rename(columns = {'confirmed':'recoverd'})
p.iloc[len(p)-1,2]=None
p=p.set_index(['date'])
p.iloc[:,1:].plot(marker='o',figsize=(16,8)).set_title('Kalman Prediction - Select Region to Change - {}'.format(p.iloc[0,0]))
print(evaluation[evaluation['region']==p.iloc[0,0]])
        region           mse          rmse           mae
146  Italy_nan  9.089757e+08  30149.224408  14758.205882
In [34]:
region='United States_nan'

evaluation=pd.DataFrame(columns=['region','mse','rmse','mae'])
place=0
for i in range(1,len(t)):
  if(t.iloc[i,1] is not t.iloc[i-1,1]):
    ex=np.array(t.iloc[i-len(ts):i,10])
    pred=np.array(t.iloc[i-len(ts):i,2])
    evaluation=evaluation.append({'region': t.iloc[i-1,1], 'mse': np.power((ex - pred),2).mean(),'rmse':sqrt(mean_squared_error(ex,pred)),'mae': (abs(ex - pred)).mean()}, ignore_index=True)
p=t[t['region']==region][['date','region','confirmed','kalman_prediction']]
p=p.rename(columns = {'confirmed':'recoverd'})
p.iloc[len(p)-1,2]=None
p=p.set_index(['date'])
p.iloc[:,1:].plot(marker='o',figsize=(16,8)).set_title('Kalman Prediction - Select Region to Change - {}'.format(p.iloc[0,0]))
print(evaluation[evaluation['region']==p.iloc[0,0]])
                region           mse          rmse           mae
245  United States_nan  9.348310e+08  30575.006101  10838.058824
In [35]:
region='United Kingdom_nan'

evaluation=pd.DataFrame(columns=['region','mse','rmse','mae'])
place=0
for i in range(1,len(t)):
  if(t.iloc[i,1] is not t.iloc[i-1,1]):
    ex=np.array(t.iloc[i-len(ts):i,10])
    pred=np.array(t.iloc[i-len(ts):i,2])
    evaluation=evaluation.append({'region': t.iloc[i-1,1], 'mse': np.power((ex - pred),2).mean(),'rmse':sqrt(mean_squared_error(ex,pred)),'mae': (abs(ex - pred)).mean()}, ignore_index=True)
p=t[t['region']==region][['date','region','confirmed','kalman_prediction']]
p=p.rename(columns = {'confirmed':'recoverd'})
p.iloc[len(p)-1,2]=None
p=p.set_index(['date'])
p.iloc[:,1:].plot(marker='o',figsize=(16,8)).set_title('Kalman Prediction - Select Region to Change - {}'.format(p.iloc[0,0]))
print(evaluation[evaluation['region']==p.iloc[0,0]])
                 region           mse         rmse          mae
244  United Kingdom_nan  1.939025e+07  4403.435614  1712.735294

Regression - 1 Day Prediction

In [36]:
#!pip install h2o
import h2o
from h2o.estimators import H2ORandomForestEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.grid.grid_search import H2OGridSearch
h2o.init(min_mem_size='7G')
import numpy as np
from sklearn.linear_model import LinearRegression
Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.6" 2020-01-14; OpenJDK Runtime Environment (build 11.0.6+10-post-Ubuntu-1ubuntu118.04.1); OpenJDK 64-Bit Server VM (build 11.0.6+10-post-Ubuntu-1ubuntu118.04.1, mixed mode, sharing)
  Starting server from /home/notebookuser/anaconda3/lib/python3.7/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpgjdyho3z
  JVM stdout: /tmp/tmpgjdyho3z/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmpgjdyho3z/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.
Warning: Your H2O cluster version is too old (1 year, 5 months and 3 days)! Please download and install the latest version from http://h2o.ai/download/
H2O cluster uptime: 01 secs
H2O cluster timezone: Etc/GMT
H2O data parsing timezone: UTC
H2O cluster version: 3.22.0.1
H2O cluster version age: 1 year, 5 months and 3 days !!!
H2O cluster name: H2O_from_python_unknownUser_2q5dg4
H2O cluster total nodes: 1
H2O cluster free memory: 7 Gb
H2O cluster total cores: 8
H2O cluster allowed cores: 8
H2O cluster status: accepting new members, healthy
H2O connection url: http://127.0.0.1:54321
H2O connection proxy: None
H2O internal security: False
H2O API Extensions: XGBoost, Algos, AutoML, Core V3, Core V4
Python version: 3.7.3 final
In [37]:
##
#### My List of Countries and Regions to train and represent data
my_train_list=[
        'Australia_New South Wales', 'Australia_Queensland',
        'Australia_South Australia', 'Australia_Victoria', 'Belgium_nan',
        'Cambodia_nan', 'Canada_British Columbia',
        'Canada_Ontario',
        'China_Anhui', 'China_Beijing',
        'China_Chongqing', 'China_Fujian', 'China_Gansu',
        'China_Guangdong', 'China_Guangxi', 'China_Guizhou',
        'China_Hainan', 'China_Hebei', 'China_Heilongjiang', 'China_Henan',
        'China_Hubei', 'China_Hunan', 'China_Inner Mongolia',
        'China_Jiangsu', 'China_Jiangxi', 'China_Jilin', 'China_Liaoning',
        'China_Ningxia', 'China_Qinghai', 'China_Shaanxi',
        'China_Shandong', 'China_Shanghai', 'China_Shanxi',
        'China_Sichuan', 'China_Tianjin', 'China_Tibet', 'China_Xinjiang',
        'China_Yunnan', 'China_Zhejiang', 'Egypt_nan', 'Finland_nan',
        'France_nan', 'Germany_nan',
        'China_Hong Kong',
        'India_nan',
        'Italy_nan', 'Japan_nan', 
        'China_Macau',
        'Malaysia_nan',
        'Nepal_nan', 
        'Philippines_nan', 'Russia_nan', 'Singapore_nan',
        'Korea, South_nan', 'Spain_nan', 'Sri Lanka_nan', 'Sweden_nan',
        'Taiwan*_nan', 'Thailand_nan', 
        'United Arab Emirates_nan',
        'Vietnam_nan',
        'China_Hubei',
        'China_Anhui', 'China_Beijing',
        'China_Chongqing', 'China_Fujian', 'China_Gansu',
        'China_Guangdong', 'China_Guangxi', 'China_Guizhou',
        'China_Hainan', 'China_Hebei', 'China_Heilongjiang','China_Henan','China_Hunan', 
        'China_Jiangsu', 'China_Jiangxi', 'China_Jilin', 'China_Liaoning',
        'China_Ningxia', 'China_Qinghai', 'China_Shaanxi',
        'China_Shandong', 'China_Shanghai', 'China_Shanxi',
        'China_Sichuan',  'China_Xinjiang',
        'China_Yunnan', 'China_Zhejiang', 
 ### JOAO - LIST of Countries - Start here
       'Andorra_nan', 'Morocco_nan',
        'Italy_nan',
        'United States_nan',
        'Portugal_nan',
        'Spain_nan',  
        'Netherlands_nan',
        'France_nan', 
        'Belgium_nan', 'Poland_nan',
        'India_nan',
        'United Kingdom_nan', 
        'Switzerland_nan',
        'Germany_nan',
        'Japan_nan'
  ### JOAO - LIST of Countries - Finish here
       ]
In [44]:
train=train.fillna(0) 

###train_df=train[train['date']<'2020-02-17']
###boots=train_df[train_df['date']>='2020-02-14'] # some bootstrap to give more weight for recent days
###train_df=train_df.append([boots[boots['date']>='2020-02-14']]*1000,ignore_index=True)
###train_df_hubei=train_df[train_df['region']=='China_Hubei']
###test=train[train['date']>='2020-02-17']
###test=test[test['date']<'2020-02-19']
###
### Joao - Training  progression - When growth happened 2020/03/18 to 2020/03/21
train_df=train[train['date']<'2020-03-21']
boots=train_df[train_df['date']>='2020-03-27'] # some bootstrap to give more weight for recent days
train_df=train_df.append([boots[boots['date']>='2020-03-18']]*1000,ignore_index=True)

### Train progression of the Virus ### In Country list or Spain only
region_to_train=my_train_list
train_df_v2=train_df[train_df['region'].isin(region_to_train)] # =='Spain_nan'] #
test=train[train['date']>='2020-03-21']
test=test[test['date']<'2020-03-30']
In [45]:
x_col=[#'region',
            '1_day_change', '3_day_change','7_day_change',
            '1_day_change_rate', 
            '3_day_change_rate',
            '7_day_change_rate', 
            'last_day', 'kalman_prediction','infected_rate', 'min', 'max'
          ]
In [46]:
x=train_df[x_col]
y=train_df['confirmed']
reg = LinearRegression().fit(x,y)

pred2=reg.predict(test[x_col]); pred2=pd.DataFrame(pred2); pred2=round(pred2)
pred2['confirmed']=test['confirmed'].values; pred2['date']=test['date'].values; pred2['region']=test['region'].values
pred2.iloc[:55]
Out[46]:
0 confirmed date region
0 39.0 24 2020-03-21 Afghanistan_nan
1 37.0 40 2020-03-22 Afghanistan_nan
2 60.0 40 2020-03-23 Afghanistan_nan
3 55.0 74 2020-03-24 Afghanistan_nan
4 105.0 84 2020-03-25 Afghanistan_nan
5 109.0 94 2020-03-26 Afghanistan_nan
6 121.0 110 2020-03-27 Afghanistan_nan
7 137.0 110 2020-03-28 Afghanistan_nan
8 132.0 120 2020-03-29 Afghanistan_nan
9 82.0 76 2020-03-21 Albania_nan
10 88.0 89 2020-03-22 Albania_nan
11 105.0 104 2020-03-23 Albania_nan
12 123.0 123 2020-03-24 Albania_nan
13 146.0 146 2020-03-25 Albania_nan
14 174.0 174 2020-03-26 Albania_nan
15 207.0 186 2020-03-27 Albania_nan
16 216.0 197 2020-03-28 Albania_nan
17 226.0 212 2020-03-29 Albania_nan
18 114.0 139 2020-03-21 Algeria_nan
19 180.0 201 2020-03-22 Algeria_nan
20 255.0 230 2020-03-23 Algeria_nan
21 280.0 264 2020-03-24 Algeria_nan
22 316.0 302 2020-03-25 Algeria_nan
23 353.0 367 2020-03-26 Algeria_nan
24 432.0 409 2020-03-27 Algeria_nan
25 472.0 454 2020-03-28 Algeria_nan
26 518.0 511 2020-03-29 Algeria_nan
27 141.0 88 2020-03-21 Andorra_nan
28 162.0 113 2020-03-22 Andorra_nan
29 209.0 133 2020-03-23 Andorra_nan
30 202.0 164 2020-03-24 Andorra_nan
31 200.0 188 2020-03-25 Andorra_nan
32 226.0 224 2020-03-26 Andorra_nan
33 269.0 267 2020-03-27 Andorra_nan
34 318.0 308 2020-03-28 Andorra_nan
35 365.0 334 2020-03-29 Andorra_nan
36 2.0 2 2020-03-21 Angola_nan
37 3.0 2 2020-03-22 Angola_nan
38 3.0 3 2020-03-23 Angola_nan
39 7.0 3 2020-03-24 Angola_nan
40 5.0 3 2020-03-25 Angola_nan
41 5.0 4 2020-03-26 Angola_nan
42 6.0 4 2020-03-27 Angola_nan
43 9.0 5 2020-03-28 Angola_nan
44 10.0 7 2020-03-29 Angola_nan
45 4.0 1 2020-03-21 Antigua and Barbuda_nan
46 4.0 1 2020-03-22 Antigua and Barbuda_nan
47 4.0 3 2020-03-23 Antigua and Barbuda_nan
48 9.0 3 2020-03-24 Antigua and Barbuda_nan
49 9.0 3 2020-03-25 Antigua and Barbuda_nan
50 9.0 7 2020-03-26 Antigua and Barbuda_nan
51 16.0 7 2020-03-27 Antigua and Barbuda_nan
52 15.0 7 2020-03-28 Antigua and Barbuda_nan
53 15.0 7 2020-03-29 Antigua and Barbuda_nan
54 159.0 158 2020-03-21 Argentina_nan
In [47]:
pred2.iloc[100:150]
Out[47]:
0 confirmed date region
100 268.0 259 2020-03-22 Australia_Queensland
101 306.0 319 2020-03-23 Australia_Queensland
102 377.0 397 2020-03-24 Australia_Queensland
103 469.0 443 2020-03-25 Australia_Queensland
104 510.0 493 2020-03-26 Australia_Queensland
105 559.0 555 2020-03-27 Australia_Queensland
106 623.0 625 2020-03-28 Australia_Queensland
107 699.0 656 2020-03-29 Australia_Queensland
108 65.0 67 2020-03-21 Australia_South Australia
109 86.0 100 2020-03-22 Australia_South Australia
110 130.0 134 2020-03-23 Australia_South Australia
111 168.0 170 2020-03-24 Australia_South Australia
112 210.0 170 2020-03-25 Australia_South Australia
113 196.0 235 2020-03-26 Australia_South Australia
114 285.0 257 2020-03-27 Australia_South Australia
115 295.0 287 2020-03-28 Australia_South Australia
116 331.0 299 2020-03-29 Australia_South Australia
117 14.0 16 2020-03-21 Australia_Tasmania
118 23.0 22 2020-03-22 Australia_Tasmania
119 31.0 28 2020-03-23 Australia_Tasmania
120 39.0 28 2020-03-24 Australia_Tasmania
121 36.0 36 2020-03-25 Australia_Tasmania
122 47.0 47 2020-03-26 Australia_Tasmania
123 61.0 47 2020-03-27 Australia_Tasmania
124 58.0 62 2020-03-28 Australia_Tasmania
125 79.0 66 2020-03-29 Australia_Tasmania
126 137.0 229 2020-03-21 Australia_Victoria
127 287.0 355 2020-03-22 Australia_Victoria
128 440.0 355 2020-03-23 Australia_Victoria
129 408.0 411 2020-03-24 Australia_Victoria
130 473.0 466 2020-03-25 Australia_Victoria
131 522.0 520 2020-03-26 Australia_Victoria
132 586.0 574 2020-03-27 Australia_Victoria
133 643.0 685 2020-03-28 Australia_Victoria
134 773.0 769 2020-03-29 Australia_Victoria
135 82.0 90 2020-03-21 Australia_Western Australia
136 117.0 120 2020-03-22 Australia_Western Australia
137 154.0 140 2020-03-23 Australia_Western Australia
138 172.0 175 2020-03-24 Australia_Western Australia
139 216.0 175 2020-03-25 Australia_Western Australia
140 204.0 231 2020-03-26 Australia_Western Australia
141 282.0 231 2020-03-27 Australia_Western Australia
142 263.0 278 2020-03-28 Australia_Western Australia
143 330.0 311 2020-03-29 Australia_Western Australia
144 2727.0 2814 2020-03-21 Austria_nan
145 3195.0 3582 2020-03-22 Austria_nan
146 4126.0 4474 2020-03-23 Austria_nan
147 5154.0 5283 2020-03-24 Austria_nan
148 6017.0 5588 2020-03-25 Austria_nan
149 6150.0 6909 2020-03-26 Austria_nan
In [48]:
train_h20 = h2o.H2OFrame(train_df)
###train_h20_hubei = h2o.H2OFrame(train_df_hubei) # different model for Hubei
#
### Joao - Italian Model
train_h20_v2 = h2o.H2OFrame(train_df_v2) # different model for V2 region ### Spain This time

training_columns = ['region','1_day_change', '3_day_change', '7_day_change', '1_day_change_rate', '3_day_change_rate',
                    '7_day_change_rate', 'last_day', 'kalman_prediction','infected_rate', 'min', 'max'
                   ]                 
# Output parameter train against input parameters
response_column = 'confirmed'

# model = H2ORandomForestEstimator(ntrees=300, max_depth=12)
# model.train(x=training_columns, y=response_column, training_frame=train_h20)

###model_hubei = H2ORandomForestEstimator(ntrees=300, max_depth=12)
###model_hubei.train(x=training_columns, y=response_column, training_frame=train_h20_hubei)

### Joao - Italian Model

model_v2 = H2ORandomForestEstimator(ntrees=500, max_depth=17)
model_v2.train(x=training_columns, y=response_column, training_frame=train_h20_v2)

test_h20 = h2o.H2OFrame(test)
#test_h20_hubei = h2o.H2OFrame(test_hubei)
/home/notebookuser/anaconda3/lib/python3.7/site-packages/h2o/utils/shared_utils.py:177: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead.
  data = _handle_python_lists(python_obj.as_matrix().tolist(), -1)[1]
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
drf Model Build progress: |███████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
In [49]:
#model_hubei.varimp(True).iloc[:,:] # Feature importance for Hubei Model RF
### Joao -  Model V2
model_v2.varimp(True).iloc[:,:] # Feature importance for Hubei Model RF
Out[49]:
variable relative_importance scaled_importance percentage
0 last_day 3.153284e+13 1.000000 0.449465
1 region 1.832792e+13 0.581233 0.261244
2 7_day_change 7.284557e+12 0.231015 0.103833
3 3_day_change 3.837244e+12 0.121690 0.054696
4 1_day_change 2.272217e+12 0.072059 0.032388
5 7_day_change_rate 1.653151e+12 0.052426 0.023564
6 3_day_change_rate 1.651050e+12 0.052360 0.023534
7 min 1.301245e+12 0.041266 0.018548
8 max 1.167384e+12 0.037021 0.016640
9 1_day_change_rate 6.823608e+11 0.021640 0.009726
10 infected_rate 4.464844e+11 0.014159 0.006364
In [50]:
# # Model performance
# performance = model_hubei.model_performance(test_data=test_h20_hubei)
# # Model Create Predictions
# pred=model_hubei.predict(test_h20_hubei);pred=pred.as_data_frame(); pred=round(pred)
# #pred['daily_outcome']=test['daily_outcome'].values
# pred['confirmed']=test_hubei['confirmed'].values
# pred['date']=test_hubei['date'].values
# pred['region']=test_hubei['region'].values

# pred2=model.predict(test_h20);pred2=pred2.as_data_frame(); pred2=round(pred2)
# pred2['confirmed']=test['confirmed'].values
# pred2['date']=test['date'].values
# pred2['region']=test['region'].values
# pred=pred.append(pred2)

## Joao - Model Predictions - Country_nan _v2
performance = model_v2.model_performance(test_data=test_h20)
# # Model Create Predictions
pred=model_v2.predict(test_h20);pred=pred.as_data_frame(); pred=round(pred)
# #pred['daily_outcome']=test['daily_outcome'].values
pred['confirmed']=test['confirmed'].values
pred['date']=test['date'].values
pred['region']=test['region'].values
drf prediction progress: |████████████████████████████████████████████████| 100%
/home/notebookuser/anaconda3/lib/python3.7/site-packages/h2o/job.py:69: UserWarning: Test/Validation dataset column 'region' has levels not trained on: [Afghanistan_nan, Albania_nan, Algeria_nan, Angola_nan, Antigua and Barbuda_nan, Argentina_nan, Armenia_nan, Australia_Australian Capital Territory, Australia_Northern Territory, Australia_Tasmania, Australia_Western Australia, Austria_nan, Azerbaijan_nan, Bahamas_nan, Bahrain_nan, Bangladesh_nan, Barbados_nan, Belarus_nan, Belize_nan, Benin_nan, Bhutan_nan, Bolivia_nan, Bosnia and Herzegovina_nan, Brazil_nan, Brunei_nan, Bulgaria_nan, Burkina Faso_nan, Burma_nan, Cabo Verde_nan, Cameroon_nan, Canada_Alberta, Canada_Diamond Princess, Canada_Grand Princess, Canada_Manitoba, Canada_New Brunswick, Canada_Newfoundland and Labrador, Canada_Northwest Territories, Canada_Nova Scotia, Canada_Prince Edward Island, Canada_Quebec, Canada_Recovered, Canada_Saskatchewan, Canada_Yukon, Central African Republic_nan, Chad_nan, Chile_nan, Colombia_nan, Congo (Brazzaville)_nan, Congo (Kinshasa)_nan, Costa Rica_nan, Cote d'Ivoire_nan, Croatia_nan, Cuba_nan, Cyprus_nan, Czechia_nan, Denmark_Faroe Islands, Denmark_Greenland, Denmark_nan, Diamond Princess_nan, Djibouti_nan, Dominica_nan, Dominican Republic_nan, Ecuador_nan, El Salvador_nan, Equatorial Guinea_nan, Eritrea_nan, Estonia_nan, Eswatini_nan, Ethiopia_nan, Fiji_nan, France_French Guiana, France_French Polynesia, France_Guadeloupe, France_Martinique, France_Mayotte, France_New Caledonia, France_Reunion, France_Saint Barthelemy, France_St Martin, Gabon_nan, Gambia_nan, Georgia_nan, Ghana_nan, Greece_nan, Grenada_nan, Guatemala_nan, Guinea-Bissau_nan, Guinea_nan, Guyana_nan, Haiti_nan, Holy See_nan, Honduras_nan, Hungary_nan, Iceland_nan, Indonesia_nan, Iran_nan, Iraq_nan, Ireland_nan, Israel_nan, Jamaica_nan, Jordan_nan, Kazakhstan_nan, Kenya_nan, Kosovo_nan, Kuwait_nan, Kyrgyzstan_nan, Laos_nan, Latvia_nan, Lebanon_nan, Liberia_nan, Libya_nan, Liechtenstein_nan, Lithuania_nan, Luxembourg_nan, MS Zaandam_nan, Madagascar_nan, Maldives_nan, Mali_nan, Malta_nan, Mauritania_nan, Mauritius_nan, Mexico_nan, Moldova_nan, Monaco_nan, Mongolia_nan, Montenegro_nan, Mozambique_nan, Namibia_nan, Netherlands_Aruba, Netherlands_Curacao, Netherlands_Sint Maarten, New Zealand_nan, Nicaragua_nan, Niger_nan, Nigeria_nan, North Macedonia_nan, Norway_nan, Oman_nan, Pakistan_nan, Panama_nan, Papua New Guinea_nan, Paraguay_nan, Peru_nan, Qatar_nan, Romania_nan, Rwanda_nan, Saint Kitts and Nevis_nan, Saint Lucia_nan, Saint Vincent and the Grenadines_nan, San Marino_nan, Saudi Arabia_nan, Senegal_nan, Serbia_nan, Seychelles_nan, Slovakia_nan, Slovenia_nan, Somalia_nan, South Africa_nan, Sudan_nan, Suriname_nan, Syria_nan, Tanzania_nan, Timor-Leste_nan, Togo_nan, Trinidad and Tobago_nan, Tunisia_nan, Turkey_nan, Uganda_nan, Ukraine_nan, United Kingdom_Anguilla, United Kingdom_Bermuda, United Kingdom_British Virgin Islands, United Kingdom_Cayman Islands, United Kingdom_Channel Islands, United Kingdom_Gibraltar, United Kingdom_Isle of Man, United Kingdom_Montserrat, United Kingdom_Turks and Caicos Islands, Uruguay_nan, Uzbekistan_nan, Venezuela_nan, West Bank and Gaza_nan, Zambia_nan, Zimbabwe_nan]
  warnings.warn(w)

Correlation Matrix And Temperature

In [51]:
from string import ascii_letters
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="white")
# Compute the correlation matrix
corr = train.iloc[:,2:].corr()
# Generate a mask for the upper triangle
mask = np.triu(np.ones_like(corr, dtype=np.bool))
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.9, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})
print ('Correlation Matrix')
Correlation Matrix
In [52]:
print('Correlation To Confirmed') 
print (corr.confirmed)
Correlation To Confirmed
confirmed            1.000000
1_day_change         0.640506
3_day_change         0.657480
7_day_change         0.685247
1_day_change_rate    0.060506
3_day_change_rate    0.036342
7_day_change_rate    0.023947
last_day             0.933563
kalman_prediction         NaN
population           0.040470
infected_rate        0.337374
min                  0.066922
max                  0.091031
Name: confirmed, dtype: float64
In [53]:
import matplotlib.pyplot as plt
p=train[['date','region','min','max']].set_index('date')

p=p[p['region']=='China_Hubei']
p.iloc[:,:].plot(marker='*',figsize=(12,4),color=['#19303f','#cccc00']).set_title('Daily Min/Max Temperature - Hubei',fontdict={'fontsize': 20})

## JOAO - Temp. Teast Italy - Data Supply finishes 13/03/2020
p=train[['date','region','min','max']].set_index('date')
p=p[p['region']=='Italy_nan']
p.iloc[:,:].plot(marker='*',figsize=(12,4),color=['#19303f','#cccc00']).set_title('Daily Min/Max Temperature - Italy',fontdict={'fontsize': 20})
#
#
Out[53]:
Text(0.5, 1.0, 'Daily Min/Max Temperature - Italy')
In [54]:
avg_temp=train[['region','confirmed','min','max']]  # from 17-02-20 to 16-03-2020
avg_temp=avg_temp.groupby(by='region').mean()
avg_temp=avg_temp.sort_values('confirmed',ascending=False)
print( 'Most infected Areas Avg Temperature')
print(avg_temp.iloc[:100,1:])
Most infected Areas Avg Temperature
                                min        max
region                                        
China_Hubei                6.815942  13.857971
Italy_nan                  6.846377  15.323188
United States_nan          0.000000   0.000000
Spain_nan                  5.874286  14.737143
Germany_nan                1.457143   4.971429
Iran_nan                   5.015493  14.521127
France_nan                 2.565217   8.868116
Korea, South_nan           0.000000   0.000000
United Kingdom_nan         0.000000   0.000000
Switzerland_nan           -1.498667   1.658667
Netherlands_nan            0.000000   0.000000
China_Guangdong           15.046377  25.207246
China_Henan                2.902899   9.994203
China_Zhejiang             8.621739  15.188406
Austria_nan               -2.880000   1.018571
Belgium_nan                2.901429   7.342857
China_Hunan                9.359420  15.511594
China_Anhui                5.692754  12.076812
China_Jiangxi             10.410145  17.014493
Norway_nan                -1.758571  -0.982857
China_Shandong            -0.375362   9.228986
Sweden_nan                 0.221429   3.605714
Turkey_nan                 0.000000   0.000000
China_Jiangsu              4.866667  12.391304
Portugal_nan               5.874286  14.737143
China_Chongqing            7.376812  13.242029
Diamond Princess_nan       0.000000   0.000000
China_Sichuan              1.266667  13.475362
Brazil_nan                13.335714  17.647143
Japan_nan                  6.111429  11.238571
...                             ...        ...
Slovenia_nan               0.000000   0.000000
Bahrain_nan               11.550000  15.410000
Canada_British Columbia    3.392405   8.403797
China_Gansu               -7.685507   8.172464
Estonia_nan                0.000000   0.000000
Egypt_nan                 11.185714  20.211429
Peru_nan                   0.000000   0.000000
Panama_nan                 0.000000   0.000000
Iraq_nan                   7.147826  10.424638
China_Jilin              -14.826087  -1.707246
Mexico_nan                 0.000000   0.000000
Croatia_nan                0.501429   6.620000
Australia_Victoria        15.535443  22.273418
Argentina_nan              0.000000   0.000000
Colombia_nan               0.000000   0.000000
Australia_Queensland      22.315190  29.875949
Canada_Alberta             0.000000   0.000000
United Arab Emirates_nan  18.843836  26.616438
Lebanon_nan                2.495714   8.400000
Serbia_nan                 0.000000   0.000000
Dominican Republic_nan     0.000000   0.000000
China_Inner Mongolia     -16.494203  -0.718841
China_Xinjiang            -6.373913   3.036232
China_Ningxia             -6.020290   9.652174
Taiwan*_nan                0.000000   0.000000
Kuwait_nan                 8.648571  14.760000
Algeria_nan                6.121429  17.528571
Armenia_nan                0.000000   0.000000
San Marino_nan             0.000000   0.000000
Bulgaria_nan               0.000000   0.000000

[100 rows x 2 columns]

Kalman X Days Ahead Prediction

In [55]:
%%R

#install.packages('reshape')
NULL
In [56]:
%%R
require(pracma)
require(Metrics)
require(readr)
library(reshape)
all<- read_csv("/home/notebookuser/notebooks/covid19/korean/ts_r.csv")
all$X1<-NULL
### JOAO
#### for (i in 1:30) { # Set i days prediction
#####for (i in 1:45) { # Set i days prediction
for (i in 1:75) { # Set i days prediction    
  if( i>1) {all<-all_new}
  date<-all[,1]
  date[nrow(date) + 1,1] <-all[nrow(all),1]+1
  pred_all<-NULL
  for (n in 2:ncol(all)-1) {
    Y<-ts(data = all[n+1], start = 1, end =nrow(all)+1)  
    sig_w<-0.01
    w<-sig_w*randn(1,100) # acceleration which denotes the fluctuation (Q/R) rnorm(100, mean = 0, sd = 1)
    sig_v<-0.01
    v<-sig_v*randn(1,100)  
    t<-0.45
    phi<-matrix(c(1,0,t,1),2,2)
    gama<-matrix(c(0.5*t^2,t),2,1)
    H<-matrix(c(1,0),1,2)
    #Kalman
    x0_0<-p0_0<-matrix(c(0,0),2,1)
    p0_0<-matrix(c(1,0,0,1),2,2)
    Q<-0.01
    R<-0.01
    X<-NULL
    X2<-NULL
    pred<-NULL
    for (i in 0:nrow(all)) {
      namp <-paste("p", i+1,"_",i, sep = "")
      assign(namp, phi%*%(get(paste("p", i,"_",i, sep = "")))%*%t(phi)+gama%*%Q%*%t(gama))
      namk <- paste("k", i+1, sep = "")
      assign(namk,get(paste("p", i+1,"_",i, sep = ""))%*%t(H)%*%(1/(H%*%get(paste("p", i+1,"_",i, sep = ""))%*%t(H)+R)))
      namx <- paste("x", i+1,"_",i, sep = "")
      assign(namx,phi%*%get(paste("x", i,"_",i, sep = "")))
      namE <- paste("E", i+1, sep = "")
      assign(namE,Y[i+1]-H%*%get(paste("x", i+1,"_",i, sep = "")))
      namx2 <- paste("x", i+1,"_",i+1, sep = "")
      assign(namx2,get(paste("x", i+1,"_",i, sep = ""))+get(paste("k", i+1, sep = ""))%*%get(paste("E", i+1, sep = "")))
      namp2 <- paste("p", i+1,"_",i+1, sep = "")
      assign(namp2,(p0_0-get(paste("k", i+1, sep = ""))%*%H)%*%get(paste("p", i+1,"_",i, sep = "")))
      X<-rbind(X,get(paste("x", i+1,"_",i,sep = ""))[1])
      X2<-rbind(X2,get(paste("x", i+1,"_",i,sep = ""))[2])
      if(i>2){
        remove(list=(paste("p", i-1,"_",i-2, sep = "")))
        remove(list=(paste("k", i-1, sep = "")))
        remove(list=(paste("E", i-1, sep = "")))
        remove(list=(paste("p", i-2,"_",i-2, sep = "")))
        remove(list=(paste("x", i-1,"_",i-2, sep = "")))
        remove(list=(paste("x", i-2,"_",i-2, sep = "")))}
    } 
    pred<-NULL
    pred<-cbind(Y,X,round(X2,4))
    pred<-as.data.frame(pred)
    pred$region<-colnames(all[,n+1])
    pred$date<-date$date
    pred$actual<-rbind(0,(cbind(pred[2:nrow(pred),1])/pred[1:nrow(pred)-1,1]-1)*100)
    pred$predict<-rbind(0,(cbind(pred[2:nrow(pred),2])/pred[1:nrow(pred)-1,2]-1)*100)
    pred$pred_rate<-(pred$X/pred$Y-1)*100
    pred$X2_change<-rbind(0,(cbind(pred[2:nrow(pred),3]-pred[1:nrow(pred)-1,3])))
    pred_all<-rbind(pred_all,pred)
  }
  pred_all<-cbind(pred_all[,4:5],pred_all[,1:3])
  names(pred_all)[5]<-"X2"
  pred_all<-pred_all[,1:5]
       
pred_all_today=pred_all[with( pred_all, order(region, date)), ]
all_new=all
#all_new[nrow(all_new),1]<-all_new[nrow(all),1]+1
temp<-with(pred_all_today, pred_all_today[date == all[nrow(all),1]+1, ])
temp<-cbind(temp[,1:2],temp[,4])
temp2<-reshape(temp, direction = "wide", idvar = "date", timevar = "region")
rand_num<-runif(ncol(temp2)-1, 0.9, 1.05)
temp2[,2:ncol(temp2)]<-temp2[,2:ncol(temp2)]*rand_num
colnames(temp2)=colnames(all_new)
all_new<-rbind(all_new,temp2)
all_new[,2:ncol(all_new)]<-round(all_new[,2:ncol(all_new)])
for (i in 2:ncol(all_new)) {
  all_new[nrow(all_new),i]=max(all_new[nrow(all_new)-1,i],all_new[nrow(all_new),i])}
}
WARNING:rpy2.rinterface_lib.callbacks:R[write to console]: Parsed with column specification:
cols(
  .default = col_double(),
  date = col_date(format = "")
)

WARNING:rpy2.rinterface_lib.callbacks:R[write to console]: See spec(...) for full column specifications.

In [57]:
all_new=%R all_new
In [58]:
all_new['date']=pd.to_datetime(all_new['date'],unit='d')
In [59]:
# Select regions
region=['date',
        'Australia_New South Wales', 'Australia_Queensland',
        'Australia_South Australia', 'Australia_Victoria', 'Belgium_nan',
        'Cambodia_nan', 'Canada_British Columbia',
        'Canada_Ontario',
        'China_Anhui', 'China_Beijing',
        'China_Chongqing', 'China_Fujian', 'China_Gansu',
        'China_Guangdong', 'China_Guangxi', 'China_Guizhou',
        'China_Hainan', 'China_Hebei', 'China_Heilongjiang', 'China_Henan',
        'China_Hubei', 'China_Hunan', 'China_Inner Mongolia',
        'China_Jiangsu', 'China_Jiangxi', 'China_Jilin', 'China_Liaoning',
        'China_Ningxia', 'China_Qinghai', 'China_Shaanxi',
        'China_Shandong', 'China_Shanghai', 'China_Shanxi',
        'China_Sichuan', 'China_Tianjin', 'China_Tibet', 'China_Xinjiang',
        'China_Yunnan', 'China_Zhejiang', 'Egypt_nan', 'Finland_nan',
        'France_nan', 'Germany_nan',
        'China_Hong Kong',
        'India_nan',
        'Italy_nan', 'Japan_nan', 
        'China_Macau',
        'Malaysia_nan',
        'Nepal_nan', 
        'Philippines_nan', 'Russia_nan', 'Singapore_nan',
        'Korea, South_nan', 'Spain_nan', 'Sri Lanka_nan', 'Sweden_nan',
        'Taiwan*_nan', 'Thailand_nan', 
        'United Arab Emirates_nan',
#       'Unites States_Boston, MA',
#        'Unites States_Chicago, IL', 'Unites States_Los Angeles, CA',
#        'Unites States_Madison, WI', 'Unites States_Orange, CA',
#        'Unites States_San Antonio, TX', 'Unites States_San Benito, CA',
#        'Unites States_San Diego County, CA',
#        'Unites States_Santa Clara, CA', 'Unites States_Seattle, WA',
#        'Unites States_Tempe, AZ',
        'Vietnam_nan',
#       ]
        'China_Hubei',
        'China_Anhui', 'China_Beijing',
        'China_Chongqing', 'China_Fujian', 'China_Gansu',
        'China_Guangdong', 'China_Guangxi', 'China_Guizhou',
        'China_Hainan', 'China_Hebei', 'China_Heilongjiang','China_Henan','China_Hunan', 
        'China_Jiangsu', 'China_Jiangxi', 'China_Jilin', 'China_Liaoning',
        'China_Ningxia', 'China_Qinghai', 'China_Shaanxi',
        'China_Shandong', 'China_Shanghai', 'China_Shanxi',
        'China_Sichuan',  'China_Xinjiang',
        'China_Yunnan', 'China_Zhejiang', 
 #     'Hong Kong_Hong Kong','Others_Diamond Princess cruise ship'
 ### JOAO - LIST of Countries - Start here
       'Andorra_nan', 'Morocco_nan',
 #       'Italy_nan',
        'United States_nan',
        'Portugal_nan',
 #       'Spain_nan',  
        'Netherlands_nan',
  #      'France_nan', 
        'Belgium_nan', 'Poland_nan',
    #     'India_nan',
        'United Kingdom_nan', 
        'Switzerland_nan',
    #    'Germany_nan',
  ### JOAO - LIST of Countries - Finish here       
  #      'Japan_nan'
       ]
p_kalman=all_new[region]
#p=all_new
#p.iloc[len(p)-1,2]=None
p_kalman=p_kalman.set_index(['date'])
p_kalman.iloc[:,:].plot(marker='o',figsize=(24,14)).set_title('Kalman Prediction')

#p_kalman2=all_new[['date','China_Hubei']]
p_kalman2=all_new[['date','Spain_nan']] ## Joao
p_kalman2=p_kalman2.set_index(['date'])
p_kalman2.iloc[:,:].plot(marker='o',figsize=(24,14)).set_title('Kalman Prediction - Select Country/Region to Change - {}'.format(p_kalman2.columns[0]))
Out[59]:
Text(0.5, 1.0, 'Kalman Prediction - Select Country/Region to Change - Spain_nan')
In [61]:
### Joao - Dynamic print
#print(region[:])
for i in range(1,len(region)):
    country_print=region[i]
    #print("here:"+country_print)
    p_kalman_rg=all_new[['date',country_print]]
    p_kalman_rg=p_kalman_rg.set_index(['date'])
    p_kalman_rg.iloc[:,:].plot(marker='o',figsize=(16,8)).set_title('Kalman Prediction - Select Country/Region to Change - {}'.format(p_kalman_rg.columns[0]))
    
In [62]:
### JOAO - Old code
p_kalman3=all_new[['date','Italy_nan']]
p_kalman3=p_kalman3.set_index(['date'])
p_kalman3.iloc[:,:].plot(marker='o',figsize=(16,8)).set_title('Kalman Prediction - Select Country/Region to Change - {}'.format(p_kalman3.columns[0]))

p_kalman4=all_new[['date','United States_nan']]
p_kalman4=p_kalman4.set_index(['date'])
p_kalman4.iloc[:,:].plot(marker='o',figsize=(16,8)).set_title('Kalman Prediction - Select Country/Region to Change - {}'.format(p_kalman4.columns[0]))

p_kalman5=all_new[['date','Germany_nan']]
p_kalman5=p_kalman5.set_index(['date'])
p_kalman5.iloc[:,:].plot(marker='o',figsize=(16,8)).set_title('Kalman Prediction - Select Country/Region to Change - {}'.format(p_kalman5.columns[0]))

p_kalman6=all_new[['date','France_nan']]
p_kalman6=p_kalman6.set_index(['date'])
p_kalman6.iloc[:,:].plot(marker='o',figsize=(16,8)).set_title('Kalman Prediction - Select Country/Region to Change - {}'.format(p_kalman6.columns[0]))

p_kalman7=all_new[['date','Netherlands_nan']]
p_kalman7=p_kalman7.set_index(['date'])
p_kalman7.iloc[:,:].plot(marker='o',figsize=(16,8)).set_title('Kalman Prediction - Select Country/Region to Change - {}'.format(p_kalman7.columns[0]))

p_kalman8=all_new[['date','Portugal_nan']]
p_kalman8=p_kalman8.set_index(['date'])
p_kalman8.iloc[:,:].plot(marker='o',figsize=(16,8)).set_title('Kalman Prediction - Select Country/Region to Change - {}'.format(p_kalman8.columns[0]))

p_kalman9=all_new[['date','United Kingdom_nan']]
p_kalman9=p_kalman9.set_index(['date'])
p_kalman9.iloc[:,:].plot(marker='o',figsize=(16,8)).set_title('Kalman Prediction - Select Country/Region to Change - {}'.format(p_kalman9.columns[0]))

p_kalman10=all_new[['date','Poland_nan']]
p_kalman10=p_kalman10.set_index(['date'])
p_kalman10.iloc[:,:].plot(marker='o',figsize=(16,8)).set_title('Kalman Prediction - Select Country/Region to Change - {}'.format(p_kalman10.columns[0]))
Out[62]:
Text(0.5, 1.0, 'Kalman Prediction - Select Country/Region to Change - Poland_nan')
In [63]:
 t.to_csv(r't_confirmed_global.csv')
In [64]:
all_new.to_csv(r'prediction_kalman_filter_global.csv')

Iterative Regression

In [65]:
t_iter=all_new.set_index(['date'])
t_iter=t_iter.stack().reset_index(name='confirmed')
t_iter.columns=['date', 'region','confirmed']
t_iter['date']=pd.to_datetime(t_iter['date'] ,errors ='coerce')
t_iter=t_iter.sort_values(['region', 'date'])

t_iter=t_iter.reset_index(drop=True)
for i in range(1,len(t_iter)+1):
  if(t_iter.iloc[i,1] is not t_iter.iloc[i-1,1]):
    t_iter.loc[len(t_iter)+1] = [t_iter.iloc[i-1,0]+ pd.DateOffset(1),t_iter.iloc[i-1,1], 0] 
t_iter=t_iter.sort_values(['region', 'date'])
t_iter=t_iter.reset_index(drop=True)

### Joao - Fix - RuntimeWarning: divide by zero encountered in double_scalars
#
t_iter['1_day_change']=t_iter['3_day_change']=t_iter['7_day_change']=t_iter['1_day_change_rate']=t_iter['3_day_change_rate']=t_iter['7_day_change_rate']=t_iter['last_day']=0
for i in range(1,len(t_iter)):
  if(t_iter.iloc[i,1] is t_iter.iloc[i-2,1]):
    t_iter.iloc[i,3]=t_iter.iloc[i-1,2]-t_iter.iloc[i-2,2]
    t_iter.iloc[i,6]=((t_iter.iloc[i-1,2] +1)/(t_iter.iloc[i-2,2]-1 +1))*100
    t_iter.iloc[i,9]=t_iter.iloc[i-1,2]
  if(t_iter.iloc[i,1] is t_iter.iloc[i-4,1]):
    t_iter.iloc[i,4]=t_iter.iloc[i-1,2]-t_iter.iloc[i-4,2]
    t_iter.iloc[i,7]=((t_iter.iloc[i-1,2] +1)/(t_iter.iloc[i-4,2]-1 +1))*100
  if(t_iter.iloc[i,1] is t_iter.iloc[i-8,1]):
    t_iter.iloc[i,5]=t_iter.iloc[i-1,2]-t_iter.iloc[i-8,2]
    t_iter.iloc[i,8]=((t_iter.iloc[i-1,2] +1)/(t_iter.iloc[i-8,2]-1 +1))*100
t_iter=t_iter.fillna(0)  

# t_iter=t_iter.merge(temp[['date','region', 'X']],how='left',on=['date','region'])
# t_iter=t_iter.rename(columns = {'X':'kalman_prediction'}) 
t_iter=t_iter.replace([np.inf, -np.inf], 0)
t_iter['kalman_prediction']=round(t_iter['confirmed'])

test_iter=t_iter.merge(confirmed[['region',' Population ']],how='left',on='region')
test_iter=test_iter.rename(columns = {' Population ':'population'})
test_iter['population']=test_iter['population'].str.replace(r" ", '')
test_iter['population']=test_iter['population'].str.replace(r",", '')
test_iter['population']=test_iter['population'].fillna(1)
test_iter['population']=test_iter['population'].astype('int32')
## Joao - Fix Divid By Zero
#test_iter['infected_rate'] =test_iter['last_day']/test_iter['population']*10000
test_iter['infected_rate'] =(test_iter['last_day']+1)/(test_iter['population']+1)*10000
#
test_iter=test_iter.merge(w,how='left',on=['date','region'])
#test_iter=test_iter.sort_values(['region', 'date'])

test_iter_temp=test_iter[np.isnan(test_iter['min'])]
test_iter_temp=test_iter_temp.drop(columns=['min', 'max'])
test_iter_temp=test_iter_temp.merge(w_forecast,how='left',on=['date','region'])
test_iter=test_iter.dropna()
test_iter=test_iter.append(test_iter_temp)
test_iter=test_iter.sort_values(['region', 'date'])
### fill missing weather 
for i in range(0,len(test_iter)):
  if(np.isnan(test_iter.iloc[i,13])):
    if(test_iter.iloc[i,1] is test_iter.iloc[i-1,1]):
      test_iter.iloc[i,13]=test_iter.iloc[i-1,13]+abs(test_iter.iloc[i-1,13]*.01)
      test_iter.iloc[i,14]=test_iter.iloc[i-1,14]+abs(test_iter.iloc[i-1,14]*.01)
/home/notebookuser/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:20: RuntimeWarning: divide by zero encountered in double_scalars
/home/notebookuser/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:24: RuntimeWarning: divide by zero encountered in double_scalars
/home/notebookuser/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:27: RuntimeWarning: divide by zero encountered in double_scalars
In [66]:
test_iter=test_iter.fillna(0) 
test_iter[test_iter.isnull().any(axis=1)]
Out[66]:
date region confirmed 1_day_change 3_day_change 7_day_change 1_day_change_rate 3_day_change_rate 7_day_change_rate last_day kalman_prediction population infected_rate min max
In [72]:
test_iter[35620:35640]
Out[72]:
date region confirmed 1_day_change 3_day_change 7_day_change 1_day_change_rate 3_day_change_rate 7_day_change_rate last_day kalman_prediction population infected_rate min max
33232 2020-03-21 United Kingdom_nan 5018.0 1294.0 2033.0 3185.0 148.159167 204.307692 499.248120 3983.0 5018.0 66440000 0.599639 0.0 0.0
33233 2020-03-22 United Kingdom_nan 5683.0 1035.0 2392.0 3878.0 126.010545 191.127190 440.263158 5018.0 5683.0 66440000 0.755418 0.0 0.0
33234 2020-03-23 United Kingdom_nan 6650.0 665.0 2994.0 4543.0 113.272220 211.379695 498.596491 5683.0 6650.0 66440000 0.855509 0.0 0.0
33235 2020-03-24 United Kingdom_nan 8077.0 967.0 2667.0 5107.0 117.033257 166.984685 431.043422 6650.0 8077.0 66440000 1.001054 0.0 0.0
33236 2020-03-25 United Kingdom_nan 9529.0 1427.0 3059.0 6127.0 121.473684 160.980470 414.256410 8077.0 9529.0 66440000 1.215834 0.0 0.0
33237 2020-03-26 United Kingdom_nan 11658.0 1452.0 3846.0 6903.0 117.989352 167.693120 362.909368 9529.0 11658.0 66440000 1.434377 0.0 0.0
33238 2020-03-27 United Kingdom_nan 14543.0 2129.0 5008.0 8969.0 122.352818 175.323308 433.581257 11658.0 14543.0 66440000 1.754816 0.0 0.0
33239 2020-03-28 United Kingdom_nan 17089.0 2885.0 6466.0 10560.0 124.755533 180.066857 365.151896 14543.0 17089.0 66440000 2.189043 0.0 0.0
33240 2020-03-29 United Kingdom_nan 19522.0 2546.0 7560.0 12071.0 117.513580 179.347256 340.573934 17089.0 19522.0 66440000 2.572246 0.0 0.0
33241 2020-03-30 United Kingdom_nan 19522.0 2433.0 7864.0 13839.0 114.243080 167.464402 343.533345 19522.0 19522.0 66440000 2.938441 0.0 0.0
33242 2020-03-31 United Kingdom_nan 22928.0 0.0 4979.0 12872.0 100.005122 134.243279 293.578947 19522.0 22928.0 66440000 2.938441 0.0 0.0
33243 2020-04-01 United Kingdom_nan 22928.0 3406.0 5839.0 14851.0 117.452105 134.174030 283.880154 22928.0 22928.0 66440000 3.451084 0.0 0.0
33244 2020-04-02 United Kingdom_nan 26768.0 0.0 3406.0 13399.0 100.004361 117.452105 240.623360 22928.0 26768.0 66440000 3.451084 0.0 0.0
33245 2020-04-03 United Kingdom_nan 27129.0 3840.0 7246.0 15110.0 116.752442 137.122221 229.619146 26768.0 27129.0 66440000 4.029049 0.0 0.0
33246 2020-04-04 United Kingdom_nan 30494.0 361.0 4201.0 12586.0 101.352361 118.326936 186.550230 27129.0 30494.0 66440000 4.083383 0.0 0.0
33247 2020-04-05 United Kingdom_nan 31953.0 3365.0 7566.0 13405.0 112.407387 133.003315 178.448125 30494.0 31953.0 66440000 4.589855 0.0 0.0
33248 2020-04-06 United Kingdom_nan 33630.0 1459.0 5185.0 12431.0 104.787827 119.373879 163.682000 31953.0 33630.0 66440000 4.809452 0.0 0.0
33249 2020-04-07 United Kingdom_nan 33630.0 1677.0 6501.0 14108.0 105.251463 123.966973 172.272308 33630.0 33630.0 66440000 5.061860 0.0 0.0
33250 2020-04-08 United Kingdom_nan 36842.0 0.0 3136.0 10702.0 100.002974 110.287270 146.680914 33630.0 36842.0 66440000 5.061860 0.0 0.0
33251 2020-04-09 United Kingdom_nan 37615.0 3212.0 4889.0 13914.0 109.553970 115.303727 160.689986 36842.0 37615.0 66440000 5.545304 0.0 0.0
In [73]:
### JOAO - ERROR - ValueError: Index contains duplicate entries, cannot reshape
pred=reg.predict(test_iter[x_col]); pred=pd.DataFrame(pred); pred.columns = ['prediction'];pred=round(pred)
pred['confirmed']=test_iter['confirmed'].values; pred['date']=test_iter['date'].values; pred['region']=test_iter['region'].values

for i in range(1,len(pred)):
    if(pred.iloc[i,3] is pred.iloc[i-1,3]):
      if(pred.iloc[i,0]<pred.iloc[i-1,1]):
        pred.iloc[i,0]=pred.iloc[i-1,1]
### JOAO - Drop Duplicates
pred=pred.drop_duplicates(subset=['date','region'], keep='last', inplace=False)        

pred=pred.pivot(index='date',columns='region',values='prediction') # pivot pred df
In [76]:
region=[
  #     'China_Anhui', 'China_Beijing',
  #     'China_Chongqing', 'China_Fujian', 'China_Gansu',
  #     'China_Guangdong', 'China_Guangxi', 'China_Guizhou',
  #     'China_Hainan', 'China_Hebei', 'China_Heilongjiang', 'China_Henan','China_Hunan', 
  #     'China_Jiangsu', 'China_Jiangxi', 'China_Jilin', 'China_Liaoning',
  #     'China_Ningxia', 'China_Qinghai', 'China_Shaanxi',
  #     'China_Shandong', 'China_Shanghai', 'China_Shanxi',
  #     'China_Sichuan',  'China_Xinjiang',
  #     'China_Yunnan', 'China_Zhejiang', 
  #      'Hong Kong_Hong Kong','Japan_nan','Others_Diamond Princess cruise ship'
  #  
 ### JOAO - LIST of Countries - Start here
 ###    'Andorra_nan', 'Morocco_nan',
        'China_Hubei',
        'Italy_nan',
    #    'United States_nan',
        'Portugal_nan', 'Spain_nan',
        'Germany_nan', 'Switzerland_nan', 
   #     'Netherlands_nan',
        'France_nan',
        'Iran_nan',
        'Belgium_nan', 'Poland_nan',
        'United Kingdom_nan', 
        'Russia_nan', 'India_nan',
        'Australia_New South Wales',
        'Sweden_nan',
      #  'Singapore_nan','China_Hong Kong',
      #  'Taiwan*_nan',
        #'Korea, South_nan',
       # 'China_Macau',
       # 'India_nan',
  ### JOAO - LIST of Countries - Finish here     
       ]

# region=['China_Anhui', 'China_Beijing',
#        'China_Chongqing', 'China_Fujian', 'China_Gansu',
#        'China_Guangdong', 'China_Guangxi', 'China_Guizhou',
#        'China_Hainan', 'China_Hebei', 'China_Heilongjiang', 'China_Henan',
#        'China_Jiangsu', 'China_Jiangxi', 'China_Liaoning',
#        'China_Ningxia', 'China_Qinghai', 'China_Shaanxi',
#        'China_Shandong', 'China_Shanghai', 'China_Shanxi',
      
#        'China_Yunnan', 'China_Zhejiang' 
#      ]

### Joao - ERROR - KeyError: "None of [Index(['Portugal_nan', 'Andorra_nan', 'Spain_nan', 'Belgium_nan',\n       'Morocco_nan', 'Netherlands_Netherlands', 'France_France', 'Poland_nan',\n       'United Kingdom_United Kingdom', 'Switzerland_nan', 'Germany_nan',\n       'Italy_nan'],\n      dtype='object')] are in the [columns]"     

p=pred[region]
p.plot(marker='*',figsize=(24,14),title ='Major Areas Prediction')
Out[76]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f3692124cc0>
In [77]:
### Joao - Error - KeyError: 'Italy_nan'
p2=pred['Italy_nan']
p2.plot(marker='o',figsize=(16,8),title ='Italy Prediction - Confirmed Cases Covid-19')
Out[77]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f36917d1e10>
In [78]:
p3=pred['Spain_nan']
p3.plot(marker='o',figsize=(16,8),title ='Spain Prediction - Confirmed Cases Covid-19')
Out[78]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f3691c872b0>
In [79]:
p4=pred['Switzerland_nan']
p4.plot(marker='o',figsize=(16,8),title ='Switzerland Prediction - Confirmed Cases Covid-19')
Out[79]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f36912d6630>
In [80]:
p5=pred['Germany_nan']
p5.plot(marker='o',figsize=(16,8),title ='Germany Prediction - Confirmed Cases Covid-19')
Out[80]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f3691d77f28>
In [81]:
p6=pred['Portugal_nan']
p6.plot(marker='o',figsize=(16,8),title ='Portugal Prediction - Confirmed Cases Covid-19')
Out[81]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f368b7f2438>
In [82]:
p8=pred['United Kingdom_nan']
p8.plot(marker='o',figsize=(16,8),title ='United Kingdom Prediction - Confirmed Cases Covid-19')
Out[82]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f368b784cf8>
In [83]:
p9=pred['France_nan']
p9.plot(marker='o',figsize=(16,8),title ='France Prediction - Confirmed Cases Covid-19')
Out[83]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f368acc1c88>
In [84]:
p10=pred['United States_nan']
p10.plot(marker='o',figsize=(25,12),title ='United States_nan  Prediction - Confirmed Cases Covid-19')
Out[84]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f368b7849b0>
In [85]:
p11=pred['China_Hubei']
p11.plot(marker='o',figsize=(16,8),title ='China - Hubei  Prediction - Confirmed Cases Covid-19')
Out[85]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f368ac2a5f8>
In [86]:
p12=pred['Iran_nan']
p12.plot(marker='o',figsize=(16,8),title ='China - Hubei  Prediction - Confirmed Cases Covid-19')
Out[86]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f368abaaa90>
In [87]:
p13=pred['Sweden_nan']
p13.plot(marker='o',figsize=(16,8),title ='China - Hubei  Prediction - Confirmed Cases Covid-19')
Out[87]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f36909b5898>
In [88]:
p13=pred['Russia_nan']
p13.plot(marker='o',figsize=(16,8),title ='China - Hubei  Prediction - Confirmed Cases Covid-19')
Out[88]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f3690fb6d68>
In [89]:
pv1=p #p2.append(p3).append(p4).append(p5).append(p6).append(p8).append(p9).append(p11)
p=pd.DataFrame(pv1)

Prediction Heatmap

In [90]:
p[:]
Out[90]:
region China_Hubei Italy_nan Portugal_nan Spain_nan Germany_nan Switzerland_nan France_nan Iran_nan Belgium_nan Poland_nan United Kingdom_nan Russia_nan India_nan Australia_New South Wales Sweden_nan
date
2020-01-22 4.0 8.0 6.0 5.0 3.0 2.0 7.0 1.0 3.0 1.0 1.0 2.0 10.0 9.0 4.0
2020-01-23 444.0 8.0 6.0 5.0 3.0 2.0 6.0 1.0 3.0 1.0 1.0 2.0 8.0 14.0 6.0
2020-01-24 521.0 6.0 4.0 3.0 4.0 2.0 9.0 1.0 4.0 1.0 1.0 5.0 9.0 9.0 5.0
2020-01-25 668.0 5.0 4.0 4.0 1.0 2.0 10.0 1.0 4.0 1.0 1.0 6.0 11.0 7.0 4.0
2020-01-26 975.0 6.0 7.0 6.0 4.0 2.0 9.0 1.0 1.0 1.0 1.0 3.0 11.0 10.0 4.0
2020-01-27 1376.0 7.0 6.0 5.0 4.0 2.0 7.0 1.0 3.0 1.0 1.0 6.0 11.0 15.0 3.0
2020-01-28 1848.0 4.0 5.0 5.0 5.0 2.0 8.0 1.0 4.0 1.0 1.0 2.0 9.0 17.0 3.0
2020-01-29 4962.0 6.0 5.0 4.0 7.0 2.0 11.0 1.0 4.0 1.0 1.0 5.0 9.0 16.0 3.0
2020-01-30 4652.0 7.0 5.0 4.0 9.0 2.0 14.0 1.0 4.0 1.0 1.0 4.0 9.0 18.0 4.0
2020-01-31 6752.0 5.0 6.0 5.0 13.0 2.0 11.0 1.0 5.0 1.0 1.0 5.0 10.0 17.0 3.0
2020-02-01 7622.0 7.0 7.0 6.0 11.0 2.0 12.0 1.0 5.0 1.0 4.0 4.0 8.0 19.0 7.0
2020-02-02 9527.0 7.0 5.0 5.0 16.0 2.0 14.0 1.0 6.0 1.0 3.0 4.0 9.0 17.0 6.0
2020-02-03 15417.0 10.0 8.0 8.0 18.0 2.0 13.0 1.0 5.0 1.0 3.0 3.0 11.0 20.0 7.0
2020-02-04 18047.0 10.0 9.0 10.0 28.0 2.0 16.0 1.0 4.0 1.0 4.0 7.0 16.0 17.0 8.0
2020-02-05 22183.0 9.0 8.0 9.0 21.0 2.0 14.0 1.0 8.0 1.0 4.0 6.0 14.0 19.0 8.0
2020-02-06 25731.0 9.0 6.0 8.0 20.0 2.0 14.0 1.0 9.0 1.0 4.0 13.0 12.0 17.0 9.0
2020-02-07 28547.0 11.0 7.0 8.0 19.0 2.0 18.0 1.0 11.0 1.0 4.0 6.0 13.0 16.0 9.0
2020-02-08 32066.0 16.0 6.0 8.0 22.0 2.0 15.0 1.0 8.0 1.0 7.0 10.0 13.0 16.0 7.0
2020-02-09 34374.0 14.0 5.0 7.0 22.0 2.0 21.0 1.0 6.0 1.0 7.0 13.0 15.0 16.0 9.0
2020-02-10 37317.0 10.0 8.0 13.0 23.0 2.0 20.0 1.0 7.0 1.0 7.0 8.0 16.0 18.0 8.0
2020-02-11 39559.0 12.0 6.0 11.0 20.0 2.0 20.0 1.0 7.0 1.0 16.0 7.0 15.0 20.0 8.0
2020-02-12 41197.0 12.0 4.0 9.0 23.0 2.0 21.0 1.0 9.0 1.0 15.0 8.0 15.0 17.0 9.0
2020-02-13 40320.0 13.0 6.0 10.0 23.0 2.0 21.0 1.0 9.0 1.0 18.0 14.0 16.0 17.0 9.0
2020-02-14 63430.0 11.0 8.0 12.0 23.0 2.0 19.0 1.0 8.0 1.0 15.0 6.0 15.0 17.0 9.0
2020-02-15 69321.0 12.0 8.0 12.0 24.0 2.0 22.0 1.0 10.0 1.0 14.0 8.0 15.0 18.0 8.0
2020-02-16 70601.0 13.0 8.0 12.0 23.0 2.0 21.0 1.0 11.0 1.0 14.0 9.0 13.0 25.0 8.0
2020-02-17 71472.0 11.0 7.0 10.0 22.0 2.0 19.0 1.0 8.0 1.0 14.0 9.0 15.0 19.0 7.0
2020-02-18 73063.0 12.0 8.0 11.0 22.0 3.0 20.0 7.0 8.0 9.0 12.0 11.0 16.0 20.0 8.0
2020-02-19 74997.0 10.0 8.0 12.0 22.0 2.0 20.0 9.0 8.0 5.0 12.0 9.0 17.0 19.0 9.0
2020-02-20 74943.0 12.0 10.0 13.0 23.0 6.0 22.0 13.0 9.0 5.0 12.0 11.0 16.0 17.0 11.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2020-05-15 259520.0 300091.0 45715.0 333508.0 170331.0 100401.0 84885.0 131270.0 88965.0 9678.0 99164.0 4912.0 3117.0 18737.0 19893.0
2020-05-16 277128.0 325045.0 45671.0 333508.0 170278.0 100285.0 85193.0 131270.0 91090.0 9678.0 99828.0 4897.0 3260.0 18568.0 20628.0
2020-05-17 274468.0 320783.0 46418.0 345548.0 170152.0 105129.0 85146.0 131270.0 90863.0 9655.0 106182.0 4895.0 3214.0 18567.0 20555.0
2020-05-18 281151.0 323847.0 47766.0 342903.0 170129.0 104583.0 85146.0 135806.0 90965.0 9973.0 105191.0 4884.0 3251.0 20319.0 20555.0
2020-05-19 278690.0 339324.0 48090.0 348968.0 170129.0 104505.0 88899.0 139795.0 97291.0 9928.0 105002.0 4883.0 3230.0 20128.0 21301.0
2020-05-20 293161.0 336567.0 50233.0 352817.0 174136.0 110785.0 88319.0 139060.0 96579.0 10013.0 104500.0 4898.0 3335.0 20106.0 21223.0
2020-05-21 290516.0 354863.0 49945.0 368354.0 173511.0 110157.0 91214.0 146130.0 96574.0 10620.0 112812.0 4896.0 3302.0 20539.0 21627.0
2020-05-22 290509.0 350839.0 49922.0 370234.0 173495.0 111968.0 91755.0 144654.0 96268.0 10529.0 111539.0 4888.0 3299.0 20455.0 21552.0
2020-05-23 288733.0 349686.0 49822.0 369186.0 173219.0 111532.0 91550.0 144655.0 100450.0 10524.0 111503.0 4886.0 3521.0 20455.0 21530.0
2020-05-24 307606.0 383290.0 49799.0 396667.0 181244.0 115099.0 91353.0 154940.0 100003.0 10484.0 110639.0 4996.0 3481.0 21804.0 21512.0
2020-05-25 304408.0 377863.0 50631.0 392083.0 179993.0 115699.0 99530.0 153037.0 100000.0 11134.0 110639.0 4978.0 3696.0 21636.0 21512.0
2020-05-26 317173.0 377296.0 50527.0 391811.0 179993.0 115604.0 98082.0 152814.0 99592.0 11040.0 119551.0 4978.0 3642.0 21636.0 21488.0
2020-05-27 317626.0 374921.0 50458.0 389673.0 179249.0 115267.0 98082.0 159774.0 107460.0 11036.0 118186.0 4970.0 3637.0 21583.0 21488.0
2020-05-28 331443.0 374066.0 50419.0 388940.0 179249.0 121156.0 97978.0 164533.0 106634.0 10966.0 117798.0 4970.0 3761.0 22375.0 21476.0
2020-05-29 328406.0 400651.0 50419.0 395454.0 179481.0 120552.0 102188.0 163551.0 106769.0 10966.0 120819.0 5139.0 3738.0 23210.0 21476.0
2020-05-30 328119.0 421271.0 50420.0 394464.0 189739.0 120552.0 101518.0 166877.0 106254.0 10966.0 120265.0 5111.0 3726.0 23930.0 21476.0
2020-05-31 326285.0 415921.0 50420.0 393160.0 187752.0 120202.0 101478.0 165330.0 106254.0 10966.0 120265.0 5260.0 3879.0 24159.0 21476.0
2020-06-01 326285.0 414139.0 50393.0 392723.0 187736.0 128631.0 107977.0 165330.0 110503.0 11532.0 120020.0 5223.0 3841.0 24092.0 21476.0
2020-06-02 325697.0 412475.0 50393.0 392723.0 187027.0 132386.0 106872.0 168316.0 110056.0 11447.0 120207.0 5450.0 4144.0 25378.0 21476.0
2020-06-03 331112.0 449895.0 51598.0 404151.0 187027.0 131962.0 111751.0 167444.0 109804.0 11839.0 129316.0 5517.0 4081.0 25264.0 22429.0
2020-06-04 341484.0 444220.0 51477.0 406167.0 193174.0 137897.0 110485.0 168392.0 109607.0 11745.0 127908.0 5498.0 4334.0 27214.0 22334.0
2020-06-05 339688.0 442990.0 53002.0 405322.0 192204.0 146181.0 110281.0 167974.0 112828.0 12176.0 127698.0 5692.0 4268.0 26965.0 22334.0
2020-06-06 339314.0 453387.0 53535.0 404579.0 199922.0 145334.0 113400.0 167792.0 112490.0 12090.0 131517.0 5649.0 4436.0 26943.0 22292.0
2020-06-07 361316.0 451257.0 53461.0 414006.0 198218.0 154924.0 114146.0 167706.0 112490.0 12090.0 130837.0 5838.0 4763.0 26857.0 23670.0
2020-06-08 357873.0 451257.0 53393.0 412583.0 198219.0 157495.0 115835.0 170929.0 112204.0 12037.0 130837.0 6182.0 4699.0 26857.0 23532.0
2020-06-09 361364.0 482998.0 55730.0 412583.0 200996.0 156983.0 118034.0 170273.0 112204.0 12037.0 130508.0 6108.0 4671.0 28201.0 23532.0
2020-06-10 359057.0 478333.0 55454.0 411443.0 200474.0 171031.0 117292.0 179900.0 112204.0 12020.0 138709.0 6323.0 4643.0 28079.0 23442.0
2020-06-11 358510.0 485534.0 58235.0 422253.0 212282.0 169329.0 117141.0 178120.0 115701.0 12020.0 137388.0 6352.0 5132.0 28026.0 23443.0
2020-06-12 358277.0 482206.0 57802.0 423844.0 210164.0 169071.0 116953.0 186563.0 115231.0 12001.0 137388.0 6326.0 5048.0 29472.0 23443.0
2020-06-13 362739.0 494928.0 57779.0 441814.0 209774.0 168476.0 123551.0 196517.0 115231.0 12001.0 136598.0 6589.0 5119.0 29340.0 23443.0

144 rows × 15 columns

In [91]:
pip install gmplot
Requirement already satisfied: gmplot in /home/notebookuser/anaconda3/lib/python3.7/site-packages (1.2.0)
Requirement already satisfied: requests in /home/notebookuser/anaconda3/lib/python3.7/site-packages (from gmplot) (2.21.0)
Requirement already satisfied: idna<2.9,>=2.5 in /home/notebookuser/anaconda3/lib/python3.7/site-packages (from requests->gmplot) (2.8)
Requirement already satisfied: urllib3<1.25,>=1.21.1 in /home/notebookuser/anaconda3/lib/python3.7/site-packages (from requests->gmplot) (1.24.1)
Requirement already satisfied: certifi>=2017.4.17 in /home/notebookuser/anaconda3/lib/python3.7/site-packages (from requests->gmplot) (2019.3.9)
Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /home/notebookuser/anaconda3/lib/python3.7/site-packages (from requests->gmplot) (3.0.4)
Note: you may need to restart the kernel to use updated packages.
In [92]:
p.to_csv('/home/notebookuser/notebooks/covid19/p_confirmed_daily.csv', index = True)
In [93]:
# Import the necessary libraries
import pandas as pd
import gmplot
# For improved table display in the notebook
#from IPython.display import display
import random 
In [94]:
heatmap=confirmed[['region','Lat','Long']]
p_m=p.T # pred.T #
heatmap=heatmap[heatmap['region'].isin(region)]
p_m=p_m.reset_index()
heatmap_m=heatmap.merge(p_m,how='left',on='region')
In [95]:
heatmap=pd.DataFrame()
vmaxni=len(heatmap_m)
#vmaxnii=(i+1)
for i in range(0,len(heatmap)):
    if heatmap_m.iloc[i,vmaxni].astype(int)==0:     #### heatmap_m.iloc[i,61] # heatmap_m.iloc[i,9] columns is the date we want to check
        continue
    heatmap=heatmap.append(pd.concat([heatmap_m.iloc[i:(i+1),1:3]]*abs(heatmap_m.iloc[i,(i+1)].astype(int)), ignore_index=True,sort=False)) 
In [96]:
### Joao - Print in datetime
from datetime import datetime
#
latitudes = heatmap_m['Lat']
longitudes = heatmap_m['Long']
# Creating the location we would like to initialize the focus on. 
# Parameters: Lattitude, Longitude, Zoom
heatmap=heatmap_m
gmap = gmplot.GoogleMapPlotter(46.99474,6.87237, 4)
gmap.heatmap(latitudes, longitudes)
datemap=datetime.today().strftime('%Y-%m-%d')
# Generate the heatmap into an HTML file
gmap.draw("Heatmap-"+datemap+".html")
In [97]:
exit()
H2O session _sid_a72d closed.
ERROR:root:Invalid alias: The name clear can't be aliased because it is another magic command.
ERROR:root:Invalid alias: The name more can't be aliased because it is another magic command.
ERROR:root:Invalid alias: The name less can't be aliased because it is another magic command.
ERROR:root:Invalid alias: The name man can't be aliased because it is another magic command.
In [ ]: